blob: e896aba4ccd020d8a9b927c2bdbacc55aca86ed5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order switches.  Exactly one of BYTEORDER_IS_BIG_ENDIAN and
   BYTEORDER_IS_LITTLE_ENDIAN ends up defined; little endian is the
   default when WORDS_BIGENDIAN is not defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
Serhiy Storchaka05997252013-01-26 12:14:02 +020060NOTE: In the interpreter's initialization phase, some globals are currently
61 initialized dynamically as needed. In the process Unicode objects may
62 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000063
64*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000065
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066
67#ifdef __cplusplus
68extern "C" {
69#endif
70
/* Highest code point defined by Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10FFFF
73
/* Type check used inside the assertions of the macros below.  When
   Py_DEBUG is defined it runs _PyUnicode_CheckConsistency() without the
   content check; otherwise it is a plain PyUnicode_Check(). */
#if !defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020079
/* Field accessors.

   The macros whose name starts with an underscore expand to a plain
   member access (usable as an lvalue) after casting `op` to the struct
   that declares the member.  PyUnicode_UTF8(), PyUnicode_UTF8_LENGTH(),
   _PyUnicode_KIND() and _PyUnicode_GET_LENGTH() additionally assert on
   `op` before reading. */

/* utf8 member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 pointer of a ready string: for compact ASCII objects this is the
   memory located right after the PyASCIIObject header, otherwise the
   utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* utf8_length member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the `length` member for compact ASCII
   objects, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr lives in PyASCIIObject, wstr_length in PyCompactUnicodeObject. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the PyASCIIObject members. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Same members as above, but asserting _PyUnicode_CHECK(op) first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* data.any member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200114
/* File-local replacement for PyUnicode_READY(): asserts
   _PyUnicode_CHECK(op), then evaluates to 0 when the string is already
   ready and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200121
/* True if the utf8 pointer of `op` is the same address as its character
   data, i.e. no separate UTF-8 buffer is in use.  Must not be used on
   compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same address as its character
   data.  Every use of the argument goes through `op`, so the macro does
   not depend on the caller's variable being called `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
129
/* True if the Unicode object owns a UTF-8 memory block of its own: it is
   not compact ASCII, its utf8 pointer is set, and that pointer differs
   from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a wstr memory block of its own: its
   wstr pointer is set and either the string is not ready yet or wstr
   differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op)                               \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
145
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *".
   to points to the buffer the converted characters are written to; it is
   fully evaluated first and only then cast to "to_type *", so an
   expression such as `buf + offset` keeps its own pointer arithmetic.

   The main loop converts four characters per iteration; the trailing
   loop handles the remaining (at most three) characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200169
Walter Dörwald16807132007-05-25 13:52:07 +0000170/* This dictionary holds all interned unicode strings. Note that references
171 to strings in this dictionary are *not* counted in the string's ob_refcnt.
172 When the interned string reaches a refcnt of 0 the string deallocation
173 function will delete the reference from this dictionary.
174
175 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000176 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000177*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200178static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200181static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182
Serhiy Storchaka678db842013-01-26 12:16:36 +0200183#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200184 do { \
185 if (unicode_empty != NULL) \
186 Py_INCREF(unicode_empty); \
187 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200188 unicode_empty = PyUnicode_New(0, 0); \
189 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200190 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200191 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
192 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200193 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000195
Serhiy Storchaka678db842013-01-26 12:16:36 +0200196#define _Py_RETURN_UNICODE_EMPTY() \
197 do { \
198 _Py_INCREF_UNICODE_EMPTY(); \
199 return unicode_empty; \
200 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200202/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200203static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200204
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205/* Single character Unicode strings in the Latin-1 range are being
206 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
239
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200240/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200241static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200242static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100243static int unicode_modifiable(PyObject *unicode);
244
Victor Stinnerfe226c02011-10-03 03:52:20 +0200245
Alexander Belopolsky40018472011-02-26 01:02:56 +0000246static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100247_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200248static PyObject *
249_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
250static PyObject *
251_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
252
253static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000255 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100256 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000257 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
258
Alexander Belopolsky40018472011-02-26 01:02:56 +0000259static void
260raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300261 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100262 PyObject *unicode,
263 Py_ssize_t startpos, Py_ssize_t endpos,
264 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000265
/* Fast detection of line break characters: a 128-entry table indexed by
   ASCII code, 1 for a line break and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
293
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300294/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
295 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000297PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000299#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000300 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000301#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000302 /* This is actually an illegal character, so it should
303 not be passed to unichr. */
304 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000305#endif
306}
307
#ifdef Py_DEBUG
/* Check the internal invariants of a Unicode object (Py_DEBUG only).

   op            -- object to check; must satisfy PyUnicode_Check().
   check_content -- if non-zero (and the string is not in the wchar_t
                    representation), also scan the characters to verify
                    that the narrowest suitable kind is used and that the
                    data is terminated by a null character.

   Every violation is reported through assert(); the function itself
   always returns 1, so it can be used as assert(_PyUnicode_Check...()). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character width matches wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is the data itself exactly when the kind has the
               width of wchar_t */
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an absent buffer must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t idx;
        Py_UCS4 maxchar = 0;
        Py_UCS4 ch;
        void *data = PyUnicode_DATA(ascii);

        for (idx = 0; idx < ascii->length; idx++) {
            ch = PyUnicode_READ(kind, data, idx);
            if (ch > maxchar)
                maxchar = ch;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be the terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
423
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100424static PyObject*
425unicode_result_wchar(PyObject *unicode)
426{
427#ifndef Py_DEBUG
428 Py_ssize_t len;
429
430 assert(Py_REFCNT(unicode) == 1);
431
432 len = _PyUnicode_WSTR_LENGTH(unicode);
433 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100434 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200435 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100436 }
437
438 if (len == 1) {
439 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100440 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
442 Py_DECREF(unicode);
443 return latin1_char;
444 }
445 }
446
447 if (_PyUnicode_Ready(unicode) < 0) {
448 Py_XDECREF(unicode);
449 return NULL;
450 }
451#else
452 /* don't make the result ready in debug mode to ensure that the caller
453 makes the string ready before using it */
454 assert(_PyUnicode_CheckConsistency(unicode, 1));
455#endif
456 return unicode;
457}
458
459static PyObject*
460unicode_result_ready(PyObject *unicode)
461{
462 Py_ssize_t length;
463
464 length = PyUnicode_GET_LENGTH(unicode);
465 if (length == 0) {
466 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100467 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200468 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100469 }
470 return unicode_empty;
471 }
472
473 if (length == 1) {
474 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
475 if (ch < 256) {
476 PyObject *latin1_char = unicode_latin1[ch];
477 if (latin1_char != NULL) {
478 if (unicode != latin1_char) {
479 Py_INCREF(latin1_char);
480 Py_DECREF(unicode);
481 }
482 return latin1_char;
483 }
484 else {
485 assert(_PyUnicode_CheckConsistency(unicode, 1));
486 Py_INCREF(unicode);
487 unicode_latin1[ch] = unicode;
488 return unicode;
489 }
490 }
491 }
492
493 assert(_PyUnicode_CheckConsistency(unicode, 1));
494 return unicode;
495}
496
497static PyObject*
498unicode_result(PyObject *unicode)
499{
500 assert(_PyUnicode_CHECK(unicode));
501 if (PyUnicode_IS_READY(unicode))
502 return unicode_result_ready(unicode);
503 else
504 return unicode_result_wchar(unicode);
505}
506
Victor Stinnerc4b49542011-12-11 22:44:26 +0100507static PyObject*
508unicode_result_unchanged(PyObject *unicode)
509{
510 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500511 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100512 return NULL;
513 Py_INCREF(unicode);
514 return unicode;
515 }
516 else
517 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100518 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100519}
520
#if defined(HAVE_MBCS)
/* Windows version information; only declared when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
524
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525/* --- Bloom Filters ----------------------------------------------------- */
526
527/* stuff to implement simple "bloom filters" for Unicode characters.
528 to keep things simple, we use a single bitmask, using the least 5
529 bits from each unicode characters as the bit index. */
530
531/* the linebreak mask is set up by Unicode_Init below */
532
Antoine Pitrouf068f942010-01-13 14:19:12 +0000533#if LONG_BIT >= 128
534#define BLOOM_WIDTH 128
535#elif LONG_BIT >= 64
536#define BLOOM_WIDTH 64
537#elif LONG_BIT >= 32
538#define BLOOM_WIDTH 32
539#else
540#error "LONG_BIT is smaller than 32"
541#endif
542
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543#define BLOOM_MASK unsigned long
544
Serhiy Storchaka05997252013-01-26 12:14:02 +0200545static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546
Antoine Pitrouf068f942010-01-13 14:19:12 +0000547#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
548#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000549
Benjamin Peterson29060642009-01-31 22:14:21 +0000550#define BLOOM_LINEBREAK(ch) \
551 ((ch) < 128U ? ascii_linebreak[(ch)] : \
552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
557 /* calculate simple bloom-style bitmask for a given unicode string */
558
Antoine Pitrouf068f942010-01-13 14:19:12 +0000559 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000560 Py_ssize_t i;
561
562 mask = 0;
563 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200564 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000565
566 return mask;
567}
568
/* True if `chr` occurs in `str`: the bloom mask is tested first and
   PyUnicode_FindChar() is only called when the mask bit is set. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200573/* Compilation of templated routines */
574
575#include "stringlib/asciilib.h"
576#include "stringlib/fastsearch.h"
577#include "stringlib/partition.h"
578#include "stringlib/split.h"
579#include "stringlib/count.h"
580#include "stringlib/find.h"
581#include "stringlib/find_max_char.h"
582#include "stringlib/localeutil.h"
583#include "stringlib/undef.h"
584
585#include "stringlib/ucs1lib.h"
586#include "stringlib/fastsearch.h"
587#include "stringlib/partition.h"
588#include "stringlib/split.h"
589#include "stringlib/count.h"
590#include "stringlib/find.h"
591#include "stringlib/find_max_char.h"
592#include "stringlib/localeutil.h"
593#include "stringlib/undef.h"
594
595#include "stringlib/ucs2lib.h"
596#include "stringlib/fastsearch.h"
597#include "stringlib/partition.h"
598#include "stringlib/split.h"
599#include "stringlib/count.h"
600#include "stringlib/find.h"
601#include "stringlib/find_max_char.h"
602#include "stringlib/localeutil.h"
603#include "stringlib/undef.h"
604
605#include "stringlib/ucs4lib.h"
606#include "stringlib/fastsearch.h"
607#include "stringlib/partition.h"
608#include "stringlib/split.h"
609#include "stringlib/count.h"
610#include "stringlib/find.h"
611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200615#include "stringlib/unicodedefs.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100619#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200620
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621/* --- Unicode Object ----------------------------------------------------- */
622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200624fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200625
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200626Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
627 Py_ssize_t size, Py_UCS4 ch,
628 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200629{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200630 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
631
632 switch (kind) {
633 case PyUnicode_1BYTE_KIND:
634 {
635 Py_UCS1 ch1 = (Py_UCS1) ch;
636 if (ch1 == ch)
637 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
638 else
639 return -1;
640 }
641 case PyUnicode_2BYTE_KIND:
642 {
643 Py_UCS2 ch2 = (Py_UCS2) ch;
644 if (ch2 == ch)
645 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
646 else
647 return -1;
648 }
649 case PyUnicode_4BYTE_KIND:
650 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
651 default:
652 assert(0);
653 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655}
656
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657static PyObject*
658resize_compact(PyObject *unicode, Py_ssize_t length)
659{
660 Py_ssize_t char_size;
661 Py_ssize_t struct_size;
662 Py_ssize_t new_size;
663 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100664 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200665 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200666 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100667 assert(PyUnicode_IS_COMPACT(unicode));
668
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200669 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100670 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200671 struct_size = sizeof(PyASCIIObject);
672 else
673 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200674 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
677 PyErr_NoMemory();
678 return NULL;
679 }
680 new_size = (struct_size + (length + 1) * char_size);
681
Victor Stinner84def372011-12-11 20:04:56 +0100682 _Py_DEC_REFTOTAL;
683 _Py_ForgetReference(unicode);
684
685 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
686 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100687 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200688 PyErr_NoMemory();
689 return NULL;
690 }
Victor Stinner84def372011-12-11 20:04:56 +0100691 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100693
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200695 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100697 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200698 _PyUnicode_WSTR_LENGTH(unicode) = length;
699 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100700 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
701 PyObject_DEL(_PyUnicode_WSTR(unicode));
702 _PyUnicode_WSTR(unicode) = NULL;
703 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
705 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200706 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200707 return unicode;
708}
709
Alexander Belopolsky40018472011-02-26 01:02:56 +0000710static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200711resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712{
Victor Stinner95663112011-10-04 01:03:50 +0200713 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100714 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 if (PyUnicode_IS_READY(unicode)) {
719 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200720 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 void *data;
722
723 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200724 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200725 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
726 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200727
728 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
729 PyErr_NoMemory();
730 return -1;
731 }
732 new_size = (length + 1) * char_size;
733
Victor Stinner7a9105a2011-12-12 00:13:42 +0100734 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
735 {
736 PyObject_DEL(_PyUnicode_UTF8(unicode));
737 _PyUnicode_UTF8(unicode) = NULL;
738 _PyUnicode_UTF8_LENGTH(unicode) = 0;
739 }
740
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 data = (PyObject *)PyObject_REALLOC(data, new_size);
742 if (data == NULL) {
743 PyErr_NoMemory();
744 return -1;
745 }
746 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200747 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200748 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200749 _PyUnicode_WSTR_LENGTH(unicode) = length;
750 }
751 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200752 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200753 _PyUnicode_UTF8_LENGTH(unicode) = length;
754 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 _PyUnicode_LENGTH(unicode) = length;
756 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200757 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200758 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 }
Victor Stinner95663112011-10-04 01:03:50 +0200762 assert(_PyUnicode_WSTR(unicode) != NULL);
763
764 /* check for integer overflow */
765 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
766 PyErr_NoMemory();
767 return -1;
768 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100769 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200770 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100771 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200772 if (!wstr) {
773 PyErr_NoMemory();
774 return -1;
775 }
776 _PyUnicode_WSTR(unicode) = wstr;
777 _PyUnicode_WSTR(unicode)[length] = 0;
778 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200779 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000780 return 0;
781}
782
Victor Stinnerfe226c02011-10-03 03:52:20 +0200783static PyObject*
784resize_copy(PyObject *unicode, Py_ssize_t length)
785{
786 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100787 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200788 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100789
Benjamin Petersonbac79492012-01-14 13:34:47 -0500790 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100791 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792
793 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
794 if (copy == NULL)
795 return NULL;
796
797 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200798 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200800 }
801 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200802 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100803
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200804 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200805 if (w == NULL)
806 return NULL;
807 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
808 copy_length = Py_MIN(copy_length, length);
809 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
810 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200811 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200812 }
813}
814
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000816 Ux0000 terminated; some code (e.g. new_identifier)
817 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000818
819 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000820 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821
822*/
823
Alexander Belopolsky40018472011-02-26 01:02:56 +0000824static PyUnicodeObject *
825_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000826{
827 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829
Thomas Wouters477c8d52006-05-27 19:21:47 +0000830 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000831 if (length == 0 && unicode_empty != NULL) {
832 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200833 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 }
835
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000836 /* Ensure we won't overflow the size. */
837 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
838 return (PyUnicodeObject *)PyErr_NoMemory();
839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840 if (length < 0) {
841 PyErr_SetString(PyExc_SystemError,
842 "Negative size passed to _PyUnicode_New");
843 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000844 }
845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
847 if (unicode == NULL)
848 return NULL;
849 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
850 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
851 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100852 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000853 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100854 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200856
Jeremy Hyltond8082792003-09-16 19:41:39 +0000857 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000858 * the caller fails before initializing str -- unicode_resize()
859 * reads str[0], and the Keep-Alive optimization can keep memory
860 * allocated for str alive across a call to unicode_dealloc(unicode).
861 * We don't want unicode_resize to read uninitialized memory in
862 * that case.
863 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200864 _PyUnicode_WSTR(unicode)[0] = 0;
865 _PyUnicode_WSTR(unicode)[length] = 0;
866 _PyUnicode_WSTR_LENGTH(unicode) = length;
867 _PyUnicode_HASH(unicode) = -1;
868 _PyUnicode_STATE(unicode).interned = 0;
869 _PyUnicode_STATE(unicode).kind = 0;
870 _PyUnicode_STATE(unicode).compact = 0;
871 _PyUnicode_STATE(unicode).ready = 0;
872 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200873 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200875 _PyUnicode_UTF8(unicode) = NULL;
876 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100877 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878 return unicode;
879}
880
Victor Stinnerf42dc442011-10-02 23:33:16 +0200881static const char*
882unicode_kind_name(PyObject *unicode)
883{
Victor Stinner42dfd712011-10-03 14:41:45 +0200884 /* don't check consistency: unicode_kind_name() is called from
885 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200886 if (!PyUnicode_IS_COMPACT(unicode))
887 {
888 if (!PyUnicode_IS_READY(unicode))
889 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600890 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200891 {
892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "legacy ascii";
895 else
896 return "legacy latin1";
897 case PyUnicode_2BYTE_KIND:
898 return "legacy UCS2";
899 case PyUnicode_4BYTE_KIND:
900 return "legacy UCS4";
901 default:
902 return "<legacy invalid kind>";
903 }
904 }
905 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600906 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200907 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200908 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200909 return "ascii";
910 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200911 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200912 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200913 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200914 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200915 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200916 default:
917 return "<invalid compact kind>";
918 }
919}
920
#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Return the result of the PyUnicode_UTF8() macro for 'unicode'. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Return the result of the _PyUnicode_COMPACT_DATA() macro for 'unicode'. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the layout flags and candidate data addresses of 'unicode' to
   stdout, then return the result of the PyUnicode_DATA() macro. */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", (void *)_PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the string 'op' to stdout: its kind name,
   length, wstr pointer, data pointer and, except for compact ASCII strings,
   the wstr length and the utf8 pointer and length.  A pointer that equals
   the data pointer is prefixed with "shared ". */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Locate the character data without using the checking macros. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    /* The length fields are signed (Py_ssize_t): print them with %zd. */
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    /* %p requires a void pointer argument */
    printf("wstr=%p", (void *)ascii->wstr);

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
972
973PyObject *
974PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
975{
976 PyObject *obj;
977 PyCompactUnicodeObject *unicode;
978 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200979 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200980 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981 Py_ssize_t char_size;
982 Py_ssize_t struct_size;
983
984 /* Optimization for empty strings */
985 if (size == 0 && unicode_empty != NULL) {
986 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200987 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 }
989
Victor Stinner9e9d6892011-10-04 01:02:02 +0200990 is_ascii = 0;
991 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992 struct_size = sizeof(PyCompactUnicodeObject);
993 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200994 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995 char_size = 1;
996 is_ascii = 1;
997 struct_size = sizeof(PyASCIIObject);
998 }
999 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001000 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001 char_size = 1;
1002 }
1003 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001004 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001005 char_size = 2;
1006 if (sizeof(wchar_t) == 2)
1007 is_sharing = 1;
1008 }
1009 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001010 if (maxchar > MAX_UNICODE) {
1011 PyErr_SetString(PyExc_SystemError,
1012 "invalid maximum character passed to PyUnicode_New");
1013 return NULL;
1014 }
Victor Stinner8f825062012-04-27 13:55:39 +02001015 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001016 char_size = 4;
1017 if (sizeof(wchar_t) == 4)
1018 is_sharing = 1;
1019 }
1020
1021 /* Ensure we won't overflow the size. */
1022 if (size < 0) {
1023 PyErr_SetString(PyExc_SystemError,
1024 "Negative size passed to PyUnicode_New");
1025 return NULL;
1026 }
1027 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1028 return PyErr_NoMemory();
1029
1030 /* Duplicated allocation code from _PyObject_New() instead of a call to
1031 * PyObject_New() so we are able to allocate space for the object and
1032 * it's data buffer.
1033 */
1034 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1035 if (obj == NULL)
1036 return PyErr_NoMemory();
1037 obj = PyObject_INIT(obj, &PyUnicode_Type);
1038 if (obj == NULL)
1039 return NULL;
1040
1041 unicode = (PyCompactUnicodeObject *)obj;
1042 if (is_ascii)
1043 data = ((PyASCIIObject*)obj) + 1;
1044 else
1045 data = unicode + 1;
1046 _PyUnicode_LENGTH(unicode) = size;
1047 _PyUnicode_HASH(unicode) = -1;
1048 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001049 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 _PyUnicode_STATE(unicode).compact = 1;
1051 _PyUnicode_STATE(unicode).ready = 1;
1052 _PyUnicode_STATE(unicode).ascii = is_ascii;
1053 if (is_ascii) {
1054 ((char*)data)[size] = 0;
1055 _PyUnicode_WSTR(unicode) = NULL;
1056 }
Victor Stinner8f825062012-04-27 13:55:39 +02001057 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 ((char*)data)[size] = 0;
1059 _PyUnicode_WSTR(unicode) = NULL;
1060 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001062 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 else {
1065 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001066 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001067 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 ((Py_UCS4*)data)[size] = 0;
1071 if (is_sharing) {
1072 _PyUnicode_WSTR_LENGTH(unicode) = size;
1073 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1074 }
1075 else {
1076 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1077 _PyUnicode_WSTR(unicode) = NULL;
1078 }
1079 }
Victor Stinner8f825062012-04-27 13:55:39 +02001080#ifdef Py_DEBUG
1081 /* Fill the data with invalid characters to detect bugs earlier.
1082 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1083 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1084 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1085 memset(data, 0xff, size * kind);
1086#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001087 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 return obj;
1089}
1090
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4: a
   high surrogate immediately followed by a low surrogate is joined into a
   single code point, every other unit (lone surrogates included) is copied
   unchanged.  The other conversions are implemented as macros for
   efficiency.

   'unicode' must be a 4-byte kind string whose length equals the number of
   code points produced from [begin, end) (asserted).

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* never write past the end of the destination */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if ((src + 1) < end
            && Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* the destination must be filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1130
Victor Stinnercd9950f2011-10-02 00:34:53 +02001131static int
Victor Stinner488fa492011-12-12 00:01:39 +01001132unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133{
Victor Stinner488fa492011-12-12 00:01:39 +01001134 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001135 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001136 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001137 return -1;
1138 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return 0;
1140}
1141
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001142static int
1143_copy_characters(PyObject *to, Py_ssize_t to_start,
1144 PyObject *from, Py_ssize_t from_start,
1145 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001147 unsigned int from_kind, to_kind;
1148 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149
Victor Stinneree4544c2012-05-09 22:24:08 +02001150 assert(0 <= how_many);
1151 assert(0 <= from_start);
1152 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001153 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001154 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001155 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156
Victor Stinnerd3f08822012-05-29 12:57:52 +02001157 assert(PyUnicode_Check(to));
1158 assert(PyUnicode_IS_READY(to));
1159 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1160
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001161 if (how_many == 0)
1162 return 0;
1163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001164 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001165 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001167 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168
Victor Stinnerf1852262012-06-16 16:38:26 +02001169#ifdef Py_DEBUG
1170 if (!check_maxchar
1171 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1172 {
1173 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1174 Py_UCS4 ch;
1175 Py_ssize_t i;
1176 for (i=0; i < how_many; i++) {
1177 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1178 assert(ch <= to_maxchar);
1179 }
1180 }
1181#endif
1182
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001183 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001184 if (check_maxchar
1185 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1186 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001187 /* Writing Latin-1 characters into an ASCII string requires to
1188 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001189 Py_UCS4 max_char;
1190 max_char = ucs1lib_find_max_char(from_data,
1191 (Py_UCS1*)from_data + how_many);
1192 if (max_char >= 128)
1193 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001194 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001195 Py_MEMCPY((char*)to_data + to_kind * to_start,
1196 (char*)from_data + from_kind * from_start,
1197 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001198 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001199 else if (from_kind == PyUnicode_1BYTE_KIND
1200 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001201 {
1202 _PyUnicode_CONVERT_BYTES(
1203 Py_UCS1, Py_UCS2,
1204 PyUnicode_1BYTE_DATA(from) + from_start,
1205 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1206 PyUnicode_2BYTE_DATA(to) + to_start
1207 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001208 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001209 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001210 && to_kind == PyUnicode_4BYTE_KIND)
1211 {
1212 _PyUnicode_CONVERT_BYTES(
1213 Py_UCS1, Py_UCS4,
1214 PyUnicode_1BYTE_DATA(from) + from_start,
1215 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1216 PyUnicode_4BYTE_DATA(to) + to_start
1217 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001218 }
1219 else if (from_kind == PyUnicode_2BYTE_KIND
1220 && to_kind == PyUnicode_4BYTE_KIND)
1221 {
1222 _PyUnicode_CONVERT_BYTES(
1223 Py_UCS2, Py_UCS4,
1224 PyUnicode_2BYTE_DATA(from) + from_start,
1225 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1226 PyUnicode_4BYTE_DATA(to) + to_start
1227 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001228 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001229 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001230 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1231
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001232 if (!check_maxchar) {
1233 if (from_kind == PyUnicode_2BYTE_KIND
1234 && to_kind == PyUnicode_1BYTE_KIND)
1235 {
1236 _PyUnicode_CONVERT_BYTES(
1237 Py_UCS2, Py_UCS1,
1238 PyUnicode_2BYTE_DATA(from) + from_start,
1239 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1240 PyUnicode_1BYTE_DATA(to) + to_start
1241 );
1242 }
1243 else if (from_kind == PyUnicode_4BYTE_KIND
1244 && to_kind == PyUnicode_1BYTE_KIND)
1245 {
1246 _PyUnicode_CONVERT_BYTES(
1247 Py_UCS4, Py_UCS1,
1248 PyUnicode_4BYTE_DATA(from) + from_start,
1249 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1250 PyUnicode_1BYTE_DATA(to) + to_start
1251 );
1252 }
1253 else if (from_kind == PyUnicode_4BYTE_KIND
1254 && to_kind == PyUnicode_2BYTE_KIND)
1255 {
1256 _PyUnicode_CONVERT_BYTES(
1257 Py_UCS4, Py_UCS2,
1258 PyUnicode_4BYTE_DATA(from) + from_start,
1259 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1260 PyUnicode_2BYTE_DATA(to) + to_start
1261 );
1262 }
1263 else {
1264 assert(0);
1265 return -1;
1266 }
1267 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001268 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001269 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001270 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001271 Py_ssize_t i;
1272
Victor Stinnera0702ab2011-09-29 14:14:38 +02001273 for (i=0; i < how_many; i++) {
1274 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001275 if (ch > to_maxchar)
1276 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001277 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1278 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001279 }
1280 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001281 return 0;
1282}
1283
Victor Stinnerd3f08822012-05-29 12:57:52 +02001284void
1285_PyUnicode_FastCopyCharacters(
1286 PyObject *to, Py_ssize_t to_start,
1287 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001288{
1289 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1290}
1291
1292Py_ssize_t
1293PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1294 PyObject *from, Py_ssize_t from_start,
1295 Py_ssize_t how_many)
1296{
1297 int err;
1298
1299 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1300 PyErr_BadInternalCall();
1301 return -1;
1302 }
1303
Benjamin Petersonbac79492012-01-14 13:34:47 -05001304 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001305 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001306 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001307 return -1;
1308
Victor Stinnerd3f08822012-05-29 12:57:52 +02001309 if (from_start < 0) {
1310 PyErr_SetString(PyExc_IndexError, "string index out of range");
1311 return -1;
1312 }
1313 if (to_start < 0) {
1314 PyErr_SetString(PyExc_IndexError, "string index out of range");
1315 return -1;
1316 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001317 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1318 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1319 PyErr_Format(PyExc_SystemError,
1320 "Cannot write %zi characters at %zi "
1321 "in a string of %zi characters",
1322 how_many, to_start, PyUnicode_GET_LENGTH(to));
1323 return -1;
1324 }
1325
1326 if (how_many == 0)
1327 return 0;
1328
Victor Stinner488fa492011-12-12 00:01:39 +01001329 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001330 return -1;
1331
1332 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1333 if (err) {
1334 PyErr_Format(PyExc_SystemError,
1335 "Cannot copy %s characters "
1336 "into a string of %s characters",
1337 unicode_kind_name(from),
1338 unicode_kind_name(to));
1339 return -1;
1340 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001341 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342}
1343
Victor Stinner17222162011-09-28 22:15:37 +02001344/* Find the maximum code point and count the number of surrogate pairs so a
1345 correct string length can be computed before converting a string to UCS4.
1346 This function counts single surrogates as a character and not as a pair.
1347
1348 Return 0 on success, or -1 on error. */
1349static int
1350find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1351 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352{
1353 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001354 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355
Victor Stinnerc53be962011-10-02 21:33:54 +02001356 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 *num_surrogates = 0;
1358 *maxchar = 0;
1359
1360 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001362 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1363 && (iter+1) < end
1364 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001366 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 iter += 2;
1369 }
1370 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001372 {
1373 ch = *iter;
1374 iter++;
1375 }
1376 if (ch > *maxchar) {
1377 *maxchar = ch;
1378 if (*maxchar > MAX_UNICODE) {
1379 PyErr_Format(PyExc_ValueError,
1380 "character U+%x is not in range [U+0000; U+10ffff]",
1381 ch);
1382 return -1;
1383 }
1384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 }
1386 return 0;
1387}
1388
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001389int
1390_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391{
1392 wchar_t *end;
1393 Py_UCS4 maxchar = 0;
1394 Py_ssize_t num_surrogates;
1395#if SIZEOF_WCHAR_T == 2
1396 Py_ssize_t length_wo_surrogates;
1397#endif
1398
Georg Brandl7597add2011-10-05 16:36:47 +02001399 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001400 strings were created using _PyObject_New() and where no canonical
1401 representation (the str field) has been set yet aka strings
1402 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001403 assert(_PyUnicode_CHECK(unicode));
1404 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001406 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001407 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001408 /* Actually, it should neither be interned nor be anything else: */
1409 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001412 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001413 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415
1416 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001417 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1418 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 PyErr_NoMemory();
1420 return -1;
1421 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001422 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001423 _PyUnicode_WSTR(unicode), end,
1424 PyUnicode_1BYTE_DATA(unicode));
1425 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1426 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1427 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1428 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001429 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001430 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001431 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001432 }
1433 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001434 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001435 _PyUnicode_UTF8(unicode) = NULL;
1436 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 }
1438 PyObject_FREE(_PyUnicode_WSTR(unicode));
1439 _PyUnicode_WSTR(unicode) = NULL;
1440 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1441 }
1442 /* In this case we might have to convert down from 4-byte native
1443 wchar_t to 2-byte unicode. */
1444 else if (maxchar < 65536) {
1445 assert(num_surrogates == 0 &&
1446 "FindMaxCharAndNumSurrogatePairs() messed up");
1447
Victor Stinner506f5922011-09-28 22:34:18 +02001448#if SIZEOF_WCHAR_T == 2
1449 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001450 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001451 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1452 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1453 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001454 _PyUnicode_UTF8(unicode) = NULL;
1455 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001456#else
1457 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001458 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001459 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001460 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001461 PyErr_NoMemory();
1462 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 }
Victor Stinner506f5922011-09-28 22:34:18 +02001464 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1465 _PyUnicode_WSTR(unicode), end,
1466 PyUnicode_2BYTE_DATA(unicode));
1467 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1468 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1469 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001470 _PyUnicode_UTF8(unicode) = NULL;
1471 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001472 PyObject_FREE(_PyUnicode_WSTR(unicode));
1473 _PyUnicode_WSTR(unicode) = NULL;
1474 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1475#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 }
1477 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1478 else {
1479#if SIZEOF_WCHAR_T == 2
1480 /* in case the native representation is 2-bytes, we need to allocate a
1481 new normalized 4-byte version. */
1482 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1484 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 PyErr_NoMemory();
1486 return -1;
1487 }
1488 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1489 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001490 _PyUnicode_UTF8(unicode) = NULL;
1491 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001492 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1493 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001494 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 PyObject_FREE(_PyUnicode_WSTR(unicode));
1496 _PyUnicode_WSTR(unicode) = NULL;
1497 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1498#else
1499 assert(num_surrogates == 0);
1500
Victor Stinnerc3c74152011-10-02 20:39:55 +02001501 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001502 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001503 _PyUnicode_UTF8(unicode) = NULL;
1504 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1506#endif
1507 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1508 }
1509 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001510 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 return 0;
1512}
1513
Alexander Belopolsky40018472011-02-26 01:02:56 +00001514static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001515unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516{
Walter Dörwald16807132007-05-25 13:52:07 +00001517 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 case SSTATE_NOT_INTERNED:
1519 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001520
Benjamin Peterson29060642009-01-31 22:14:21 +00001521 case SSTATE_INTERNED_MORTAL:
1522 /* revive dead object temporarily for DelItem */
1523 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001524 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001525 Py_FatalError(
1526 "deletion of interned string failed");
1527 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001528
Benjamin Peterson29060642009-01-31 22:14:21 +00001529 case SSTATE_INTERNED_IMMORTAL:
1530 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001531
Benjamin Peterson29060642009-01-31 22:14:21 +00001532 default:
1533 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001534 }
1535
Victor Stinner03490912011-10-03 23:45:12 +02001536 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001538 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001539 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001540 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1541 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001543 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544}
1545
#ifdef Py_DEBUG
/* Return 1 if unicode is the shared empty string or one of the cached
   one-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;
    /* Only a one-character string with a canonical representation can be
       a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;
    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode);
}
#endif
1562
Alexander Belopolsky40018472011-02-26 01:02:56 +00001563static int
Victor Stinner488fa492011-12-12 00:01:39 +01001564unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001565{
Victor Stinner488fa492011-12-12 00:01:39 +01001566 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001567 if (Py_REFCNT(unicode) != 1)
1568 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001569 if (_PyUnicode_HASH(unicode) != -1)
1570 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001571 if (PyUnicode_CHECK_INTERNED(unicode))
1572 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001573 if (!PyUnicode_CheckExact(unicode))
1574 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001575#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001576 /* singleton refcount is greater than 1 */
1577 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001578#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001579 return 1;
1580}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001581
Victor Stinnerfe226c02011-10-03 03:52:20 +02001582static int
1583unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1584{
1585 PyObject *unicode;
1586 Py_ssize_t old_length;
1587
1588 assert(p_unicode != NULL);
1589 unicode = *p_unicode;
1590
1591 assert(unicode != NULL);
1592 assert(PyUnicode_Check(unicode));
1593 assert(0 <= length);
1594
Victor Stinner910337b2011-10-03 03:20:16 +02001595 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001596 old_length = PyUnicode_WSTR_LENGTH(unicode);
1597 else
1598 old_length = PyUnicode_GET_LENGTH(unicode);
1599 if (old_length == length)
1600 return 0;
1601
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001602 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001603 _Py_INCREF_UNICODE_EMPTY();
1604 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001605 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001606 Py_DECREF(*p_unicode);
1607 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608 return 0;
1609 }
1610
Victor Stinner488fa492011-12-12 00:01:39 +01001611 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001612 PyObject *copy = resize_copy(unicode, length);
1613 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001614 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 Py_DECREF(*p_unicode);
1616 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618 }
1619
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001621 PyObject *new_unicode = resize_compact(unicode, length);
1622 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001624 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001625 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001626 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001627 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001628}
1629
Alexander Belopolsky40018472011-02-26 01:02:56 +00001630int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 PyObject *unicode;
1634 if (p_unicode == NULL) {
1635 PyErr_BadInternalCall();
1636 return -1;
1637 }
1638 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001639 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001640 {
1641 PyErr_BadInternalCall();
1642 return -1;
1643 }
1644 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001645}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001646
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001648unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1649 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001650{
1651 PyObject *result;
1652 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001653 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001654 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1655 return 0;
1656 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1657 maxchar);
1658 if (result == NULL)
1659 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001660 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 Py_DECREF(*p_unicode);
1662 *p_unicode = result;
1663 return 0;
1664}
1665
1666static int
1667unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1668 Py_UCS4 ch)
1669{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001670 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001671 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001672 return -1;
1673 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1674 PyUnicode_DATA(*p_unicode),
1675 (*pos)++, ch);
1676 return 0;
1677}
1678
Victor Stinnerc5166102012-02-22 13:55:02 +01001679/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001680
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001681 WARNING: The function doesn't copy the terminating null character and
1682 doesn't check the maximum character (may write a latin1 character in an
1683 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001684static void
1685unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1686 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001687{
1688 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1689 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001690 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001691
1692 switch (kind) {
1693 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001694 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001695 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001696 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001697 }
1698 case PyUnicode_2BYTE_KIND: {
1699 Py_UCS2 *start = (Py_UCS2 *)data + index;
1700 Py_UCS2 *ucs2 = start;
1701 assert(index <= PyUnicode_GET_LENGTH(unicode));
1702
Victor Stinner184252a2012-06-16 02:57:41 +02001703 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001704 *ucs2 = (Py_UCS2)*str;
1705
1706 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001707 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001708 }
1709 default: {
1710 Py_UCS4 *start = (Py_UCS4 *)data + index;
1711 Py_UCS4 *ucs4 = start;
1712 assert(kind == PyUnicode_4BYTE_KIND);
1713 assert(index <= PyUnicode_GET_LENGTH(unicode));
1714
Victor Stinner184252a2012-06-16 02:57:41 +02001715 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001716 *ucs4 = (Py_UCS4)*str;
1717
1718 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001719 }
1720 }
1721}
1722
1723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724static PyObject*
1725get_latin1_char(unsigned char ch)
1726{
Victor Stinnera464fc12011-10-02 20:39:30 +02001727 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001728 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001729 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730 if (!unicode)
1731 return NULL;
1732 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001733 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 unicode_latin1[ch] = unicode;
1735 }
1736 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001737 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738}
1739
Alexander Belopolsky40018472011-02-26 01:02:56 +00001740PyObject *
1741PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001743 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 Py_UCS4 maxchar = 0;
1745 Py_ssize_t num_surrogates;
1746
1747 if (u == NULL)
1748 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001750 /* If the Unicode data is known at construction time, we can apply
1751 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001754 if (size == 0)
1755 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 /* Single character Unicode objects in the Latin-1 range are
1758 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001759 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 return get_latin1_char((unsigned char)*u);
1761
1762 /* If not empty and not single character, copy the Unicode data
1763 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001764 if (find_maxchar_surrogates(u, u + size,
1765 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 return NULL;
1767
Victor Stinner8faf8212011-12-08 22:14:11 +01001768 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769 if (!unicode)
1770 return NULL;
1771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 switch (PyUnicode_KIND(unicode)) {
1773 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001774 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1776 break;
1777 case PyUnicode_2BYTE_KIND:
1778#if Py_UNICODE_SIZE == 2
1779 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1780#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001781 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1783#endif
1784 break;
1785 case PyUnicode_4BYTE_KIND:
1786#if SIZEOF_WCHAR_T == 2
1787 /* This is the only case which has to process surrogates, thus
1788 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001789 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790#else
1791 assert(num_surrogates == 0);
1792 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1793#endif
1794 break;
1795 default:
1796 assert(0 && "Impossible state");
1797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001799 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800}
1801
Alexander Belopolsky40018472011-02-26 01:02:56 +00001802PyObject *
1803PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001804{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001805 if (size < 0) {
1806 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001807 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001808 return NULL;
1809 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001810 if (u != NULL)
1811 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1812 else
1813 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001814}
1815
Alexander Belopolsky40018472011-02-26 01:02:56 +00001816PyObject *
1817PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001818{
1819 size_t size = strlen(u);
1820 if (size > PY_SSIZE_T_MAX) {
1821 PyErr_SetString(PyExc_OverflowError, "input too long");
1822 return NULL;
1823 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001824 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001825}
1826
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001827PyObject *
1828_PyUnicode_FromId(_Py_Identifier *id)
1829{
1830 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001831 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1832 strlen(id->string),
1833 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001834 if (!id->object)
1835 return NULL;
1836 PyUnicode_InternInPlace(&id->object);
1837 assert(!id->next);
1838 id->next = static_strings;
1839 static_strings = id;
1840 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001841 return id->object;
1842}
1843
1844void
1845_PyUnicode_ClearStaticStrings()
1846{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001847 _Py_Identifier *tmp, *s = static_strings;
1848 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001849 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001850 tmp = s->next;
1851 s->next = NULL;
1852 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001853 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001854 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001855}
1856
Benjamin Peterson0df54292012-03-26 14:50:32 -04001857/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001858
Victor Stinnerd3f08822012-05-29 12:57:52 +02001859PyObject*
1860_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001861{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001862 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001863 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001864 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001865#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001866 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001867#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001868 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001869 }
Victor Stinner785938e2011-12-11 20:09:03 +01001870 unicode = PyUnicode_New(size, 127);
1871 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001872 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001873 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1874 assert(_PyUnicode_CheckConsistency(unicode, 1));
1875 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001876}
1877
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001878static Py_UCS4
1879kind_maxchar_limit(unsigned int kind)
1880{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001881 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001882 case PyUnicode_1BYTE_KIND:
1883 return 0x80;
1884 case PyUnicode_2BYTE_KIND:
1885 return 0x100;
1886 case PyUnicode_4BYTE_KIND:
1887 return 0x10000;
1888 default:
1889 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001890 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001891 }
1892}
1893
Victor Stinnere6abb482012-05-02 01:15:40 +02001894Py_LOCAL_INLINE(Py_UCS4)
1895align_maxchar(Py_UCS4 maxchar)
1896{
1897 if (maxchar <= 127)
1898 return 127;
1899 else if (maxchar <= 255)
1900 return 255;
1901 else if (maxchar <= 65535)
1902 return 65535;
1903 else
1904 return MAX_UNICODE;
1905}
1906
Victor Stinner702c7342011-10-05 13:50:52 +02001907static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001908_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001909{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001910 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001911 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001912
Serhiy Storchaka678db842013-01-26 12:16:36 +02001913 if (size == 0)
1914 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001915 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001916 if (size == 1)
1917 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001918
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001919 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001920 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 if (!res)
1922 return NULL;
1923 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001924 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001925 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001926}
1927
Victor Stinnere57b1c02011-09-28 22:20:48 +02001928static PyObject*
1929_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930{
1931 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001932 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001933
Serhiy Storchaka678db842013-01-26 12:16:36 +02001934 if (size == 0)
1935 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001936 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001937 if (size == 1) {
1938 Py_UCS4 ch = u[0];
1939 if (ch < 256)
1940 return get_latin1_char((unsigned char)ch);
1941
1942 res = PyUnicode_New(1, ch);
1943 if (res == NULL)
1944 return NULL;
1945 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1946 assert(_PyUnicode_CheckConsistency(res, 1));
1947 return res;
1948 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001949
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001950 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001951 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 if (!res)
1953 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001954 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001956 else {
1957 _PyUnicode_CONVERT_BYTES(
1958 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1959 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001960 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 return res;
1962}
1963
Victor Stinnere57b1c02011-09-28 22:20:48 +02001964static PyObject*
1965_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966{
1967 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001968 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001969
Serhiy Storchaka678db842013-01-26 12:16:36 +02001970 if (size == 0)
1971 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001972 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001973 if (size == 1) {
1974 Py_UCS4 ch = u[0];
1975 if (ch < 256)
1976 return get_latin1_char((unsigned char)ch);
1977
1978 res = PyUnicode_New(1, ch);
1979 if (res == NULL)
1980 return NULL;
1981 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1982 assert(_PyUnicode_CheckConsistency(res, 1));
1983 return res;
1984 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001985
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001986 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001987 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 if (!res)
1989 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001990 if (max_char < 256)
1991 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1992 PyUnicode_1BYTE_DATA(res));
1993 else if (max_char < 0x10000)
1994 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1995 PyUnicode_2BYTE_DATA(res));
1996 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001998 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 return res;
2000}
2001
2002PyObject*
2003PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2004{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002005 if (size < 0) {
2006 PyErr_SetString(PyExc_ValueError, "size must be positive");
2007 return NULL;
2008 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002009 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002011 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002013 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002015 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002016 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002017 PyErr_SetString(PyExc_SystemError, "invalid kind");
2018 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020}
2021
Victor Stinnerece58de2012-04-23 23:36:38 +02002022Py_UCS4
2023_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2024{
2025 enum PyUnicode_Kind kind;
2026 void *startptr, *endptr;
2027
2028 assert(PyUnicode_IS_READY(unicode));
2029 assert(0 <= start);
2030 assert(end <= PyUnicode_GET_LENGTH(unicode));
2031 assert(start <= end);
2032
2033 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2034 return PyUnicode_MAX_CHAR_VALUE(unicode);
2035
2036 if (start == end)
2037 return 127;
2038
Victor Stinner94d558b2012-04-27 22:26:58 +02002039 if (PyUnicode_IS_ASCII(unicode))
2040 return 127;
2041
Victor Stinnerece58de2012-04-23 23:36:38 +02002042 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002043 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002044 endptr = (char *)startptr + end * kind;
2045 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002046 switch(kind) {
2047 case PyUnicode_1BYTE_KIND:
2048 return ucs1lib_find_max_char(startptr, endptr);
2049 case PyUnicode_2BYTE_KIND:
2050 return ucs2lib_find_max_char(startptr, endptr);
2051 case PyUnicode_4BYTE_KIND:
2052 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002053 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002054 assert(0);
2055 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002056 }
2057}
2058
Victor Stinner25a4b292011-10-06 12:31:55 +02002059/* Ensure that a string uses the most efficient storage, if it is not the
2060 case: create a new string with of the right kind. Write NULL into *p_unicode
2061 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002062static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002063unicode_adjust_maxchar(PyObject **p_unicode)
2064{
2065 PyObject *unicode, *copy;
2066 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002067 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002068 unsigned int kind;
2069
2070 assert(p_unicode != NULL);
2071 unicode = *p_unicode;
2072 assert(PyUnicode_IS_READY(unicode));
2073 if (PyUnicode_IS_ASCII(unicode))
2074 return;
2075
2076 len = PyUnicode_GET_LENGTH(unicode);
2077 kind = PyUnicode_KIND(unicode);
2078 if (kind == PyUnicode_1BYTE_KIND) {
2079 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002080 max_char = ucs1lib_find_max_char(u, u + len);
2081 if (max_char >= 128)
2082 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002083 }
2084 else if (kind == PyUnicode_2BYTE_KIND) {
2085 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002086 max_char = ucs2lib_find_max_char(u, u + len);
2087 if (max_char >= 256)
2088 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002089 }
2090 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002091 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002092 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002093 max_char = ucs4lib_find_max_char(u, u + len);
2094 if (max_char >= 0x10000)
2095 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002096 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002097 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002098 if (copy != NULL)
2099 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002100 Py_DECREF(unicode);
2101 *p_unicode = copy;
2102}
2103
Victor Stinner034f6cf2011-09-30 02:26:44 +02002104PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002105_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002106{
Victor Stinner87af4f22011-11-21 23:03:47 +01002107 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002108 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002109
Victor Stinner034f6cf2011-09-30 02:26:44 +02002110 if (!PyUnicode_Check(unicode)) {
2111 PyErr_BadInternalCall();
2112 return NULL;
2113 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002114 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002115 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002116
Victor Stinner87af4f22011-11-21 23:03:47 +01002117 length = PyUnicode_GET_LENGTH(unicode);
2118 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002119 if (!copy)
2120 return NULL;
2121 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2122
Victor Stinner87af4f22011-11-21 23:03:47 +01002123 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2124 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002125 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002126 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002127}
2128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129
Victor Stinnerbc603d12011-10-02 01:00:40 +02002130/* Widen Unicode objects to larger buffers. Don't write terminating null
2131 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002132
2133void*
2134_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2135{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002136 Py_ssize_t len;
2137 void *result;
2138 unsigned int skind;
2139
Benjamin Petersonbac79492012-01-14 13:34:47 -05002140 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002141 return NULL;
2142
2143 len = PyUnicode_GET_LENGTH(s);
2144 skind = PyUnicode_KIND(s);
2145 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002146 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002147 return NULL;
2148 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002149 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002150 case PyUnicode_2BYTE_KIND:
2151 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2152 if (!result)
2153 return PyErr_NoMemory();
2154 assert(skind == PyUnicode_1BYTE_KIND);
2155 _PyUnicode_CONVERT_BYTES(
2156 Py_UCS1, Py_UCS2,
2157 PyUnicode_1BYTE_DATA(s),
2158 PyUnicode_1BYTE_DATA(s) + len,
2159 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002160 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002161 case PyUnicode_4BYTE_KIND:
2162 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2163 if (!result)
2164 return PyErr_NoMemory();
2165 if (skind == PyUnicode_2BYTE_KIND) {
2166 _PyUnicode_CONVERT_BYTES(
2167 Py_UCS2, Py_UCS4,
2168 PyUnicode_2BYTE_DATA(s),
2169 PyUnicode_2BYTE_DATA(s) + len,
2170 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002172 else {
2173 assert(skind == PyUnicode_1BYTE_KIND);
2174 _PyUnicode_CONVERT_BYTES(
2175 Py_UCS1, Py_UCS4,
2176 PyUnicode_1BYTE_DATA(s),
2177 PyUnicode_1BYTE_DATA(s) + len,
2178 result);
2179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002181 default:
2182 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 }
Victor Stinner01698042011-10-04 00:04:26 +02002184 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 return NULL;
2186}
2187
2188static Py_UCS4*
2189as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2190 int copy_null)
2191{
2192 int kind;
2193 void *data;
2194 Py_ssize_t len, targetlen;
2195 if (PyUnicode_READY(string) == -1)
2196 return NULL;
2197 kind = PyUnicode_KIND(string);
2198 data = PyUnicode_DATA(string);
2199 len = PyUnicode_GET_LENGTH(string);
2200 targetlen = len;
2201 if (copy_null)
2202 targetlen++;
2203 if (!target) {
2204 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2205 PyErr_NoMemory();
2206 return NULL;
2207 }
2208 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2209 if (!target) {
2210 PyErr_NoMemory();
2211 return NULL;
2212 }
2213 }
2214 else {
2215 if (targetsize < targetlen) {
2216 PyErr_Format(PyExc_SystemError,
2217 "string is longer than the buffer");
2218 if (copy_null && 0 < targetsize)
2219 target[0] = 0;
2220 return NULL;
2221 }
2222 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002223 if (kind == PyUnicode_1BYTE_KIND) {
2224 Py_UCS1 *start = (Py_UCS1 *) data;
2225 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002227 else if (kind == PyUnicode_2BYTE_KIND) {
2228 Py_UCS2 *start = (Py_UCS2 *) data;
2229 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2230 }
2231 else {
2232 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 if (copy_null)
2236 target[len] = 0;
2237 return target;
2238}
2239
2240Py_UCS4*
2241PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2242 int copy_null)
2243{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002244 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 PyErr_BadInternalCall();
2246 return NULL;
2247 }
2248 return as_ucs4(string, target, targetsize, copy_null);
2249}
2250
2251Py_UCS4*
2252PyUnicode_AsUCS4Copy(PyObject *string)
2253{
2254 return as_ucs4(string, NULL, 0, 1);
2255}
2256
2257#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002258
Alexander Belopolsky40018472011-02-26 01:02:56 +00002259PyObject *
2260PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002264 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002265 PyErr_BadInternalCall();
2266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 }
2268
Martin v. Löwis790465f2008-04-05 20:41:37 +00002269 if (size == -1) {
2270 size = wcslen(w);
2271 }
2272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274}
2275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002277
/* Build a printf-style format specification into `fmt`.

   The output has the shape  %[0][width][.precision][length-modifier]c :
     - the width (preceded by '0' when `zeropad` is set) is emitted only
       when `width` is non-zero;
     - the precision is emitted only when `precision` is non-zero;
     - the length modifier is "l" for `longflag`, PY_FORMAT_LONG_LONG for
       `longlongflag`, PY_FORMAT_SIZE_T for `size_tflag` (first one set
       wins, in that order), nothing otherwise;
     - `c` is the conversion character.

   `fmt` is written with sprintf() and no bounds checking, so the caller
   must provide a large enough buffer.  The result is NUL-terminated. */
static void
makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
        int zeropad, int width, int precision, char c)
{
    const char *modifier = "";
    char *out = fmt;

    *out++ = '%';
    if (width) {
        if (zeropad)
            *out++ = '0';
        out += sprintf(out, "%d", width);
    }
    if (precision)
        out += sprintf(out, ".%d", precision);

    /* pick the length modifier */
    if (longflag)
        modifier = "l";
    else if (longlongflag) {
        /* longlongflag should only ever be nonzero on machines with
           HAVE_LONG_LONG defined */
#ifdef HAVE_LONG_LONG
        modifier = PY_FORMAT_LONG_LONG;
#else
        /* we shouldn't ever get here */
        assert(0);
        modifier = "l";
#endif
    }
    else if (size_tflag)
        modifier = PY_FORMAT_SIZE_T;

    while (*modifier)
        *out++ = *modifier++;

    *out++ = c;
    *out = '\0';
}
2313
/* helper for PyUnicode_FromFormatV()

   Parse the part of a format specification that follows '%': an optional
   width, an optional ".precision" and an optional length modifier.

   `f` must point at the '%' character.  Each p_* output pointer may be
   NULL, in which case the corresponding value is not stored.

   The returned pointer designates:
     - the conversion character in the normal case (for "l", "ll" and "z"
       the modifier is consumed only when followed by 'd', 'u' or 'i';
       otherwise the pointer stays on the modifier itself);
     - the last character before the terminating NUL when the format ends
       prematurely (e.g. "%.1" => the "1");
     - the last precision digit when the precision is directly followed by
       '%' (e.g. "%.3%s" => the "3"). */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    ++f;
    for (; Py_ISDIGIT((unsigned)*f); ++f)
        field_width = (field_width * 10) + *f - '0';

    if (*f == '.') {
        ++f;
        for (; Py_ISDIGIT((unsigned)*f); ++f)
            field_precision = (field_precision * 10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }

    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;

    /* Handle %ld, %lu, %lld, %llu and the size_t flag (%zd, %zu, %zi). */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2378
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002379/* maximum number of characters required for output of %ld. 21 characters
2380 allows for 64-bit integers (in decimal) and an optional sign. */
2381#define MAX_LONG_CHARS 21
2382/* maximum number of characters required for output of %lld.
2383 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2384 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2385#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2386
Walter Dörwaldd2034312007-05-18 16:29:38 +00002387PyObject *
2388PyUnicode_FromFormatV(const char *format, va_list vargs)
2389{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002390 va_list count;
2391 Py_ssize_t callcount = 0;
2392 PyObject **callresults = NULL;
2393 PyObject **callresult = NULL;
2394 Py_ssize_t n = 0;
2395 int width = 0;
2396 int precision = 0;
2397 int zeropad;
2398 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002399 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002400 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002401 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2403 Py_UCS4 argmaxchar;
2404 Py_ssize_t numbersize = 0;
2405 char *numberresults = NULL;
2406 char *numberresult = NULL;
2407 Py_ssize_t i;
2408 int kind;
2409 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002410
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002411 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002412 /* step 1: count the number of %S/%R/%A/%s format specifications
2413 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2414 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002416 * also estimate a upper bound for all the number formats in the string,
2417 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002419 for (f = format; *f; f++) {
2420 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002421 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2423 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2424 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2425 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002427 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002428#ifdef HAVE_LONG_LONG
2429 if (longlongflag) {
2430 if (width < MAX_LONG_LONG_CHARS)
2431 width = MAX_LONG_LONG_CHARS;
2432 }
2433 else
2434#endif
2435 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2436 including sign. Decimal takes the most space. This
2437 isn't enough for octal. If a width is specified we
2438 need more (which we allocate later). */
2439 if (width < MAX_LONG_CHARS)
2440 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441
2442 /* account for the size + '\0' to separate numbers
2443 inside of the numberresults buffer */
2444 numbersize += (width + 1);
2445 }
2446 }
2447 else if ((unsigned char)*f > 127) {
2448 PyErr_Format(PyExc_ValueError,
2449 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2450 "string, got a non-ASCII byte: 0x%02x",
2451 (unsigned char)*f);
2452 return NULL;
2453 }
2454 }
2455 /* step 2: allocate memory for the results of
2456 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2457 if (callcount) {
2458 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2459 if (!callresults) {
2460 PyErr_NoMemory();
2461 return NULL;
2462 }
2463 callresult = callresults;
2464 }
2465 /* step 2.5: allocate memory for the results of formating numbers */
2466 if (numbersize) {
2467 numberresults = PyObject_Malloc(numbersize);
2468 if (!numberresults) {
2469 PyErr_NoMemory();
2470 goto fail;
2471 }
2472 numberresult = numberresults;
2473 }
2474
2475 /* step 3: format numbers and figure out how large a buffer we need */
2476 for (f = format; *f; f++) {
2477 if (*f == '%') {
2478 const char* p;
2479 int longflag;
2480 int longlongflag;
2481 int size_tflag;
2482 int numprinted;
2483
2484 p = f;
2485 zeropad = (f[1] == '0');
2486 f = parse_format_flags(f, &width, &precision,
2487 &longflag, &longlongflag, &size_tflag);
2488 switch (*f) {
2489 case 'c':
2490 {
Serhiy Storchaka8eeae212013-06-23 20:12:14 +03002491 int ordinal = va_arg(count, int);
2492 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2493 PyErr_SetString(PyExc_OverflowError,
2494 "%c arg not in range(0x110000)");
2495 goto fail;
2496 }
2497 maxchar = Py_MAX(maxchar, (Py_UCS4)ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002498 n++;
2499 break;
2500 }
2501 case '%':
2502 n++;
2503 break;
2504 case 'i':
2505 case 'd':
2506 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2507 width, precision, *f);
2508 if (longflag)
2509 numprinted = sprintf(numberresult, fmt,
2510 va_arg(count, long));
2511#ifdef HAVE_LONG_LONG
2512 else if (longlongflag)
2513 numprinted = sprintf(numberresult, fmt,
2514 va_arg(count, PY_LONG_LONG));
2515#endif
2516 else if (size_tflag)
2517 numprinted = sprintf(numberresult, fmt,
2518 va_arg(count, Py_ssize_t));
2519 else
2520 numprinted = sprintf(numberresult, fmt,
2521 va_arg(count, int));
2522 n += numprinted;
2523 /* advance by +1 to skip over the '\0' */
2524 numberresult += (numprinted + 1);
2525 assert(*(numberresult - 1) == '\0');
2526 assert(*(numberresult - 2) != '\0');
2527 assert(numprinted >= 0);
2528 assert(numberresult <= numberresults + numbersize);
2529 break;
2530 case 'u':
2531 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2532 width, precision, 'u');
2533 if (longflag)
2534 numprinted = sprintf(numberresult, fmt,
2535 va_arg(count, unsigned long));
2536#ifdef HAVE_LONG_LONG
2537 else if (longlongflag)
2538 numprinted = sprintf(numberresult, fmt,
2539 va_arg(count, unsigned PY_LONG_LONG));
2540#endif
2541 else if (size_tflag)
2542 numprinted = sprintf(numberresult, fmt,
2543 va_arg(count, size_t));
2544 else
2545 numprinted = sprintf(numberresult, fmt,
2546 va_arg(count, unsigned int));
2547 n += numprinted;
2548 numberresult += (numprinted + 1);
2549 assert(*(numberresult - 1) == '\0');
2550 assert(*(numberresult - 2) != '\0');
2551 assert(numprinted >= 0);
2552 assert(numberresult <= numberresults + numbersize);
2553 break;
2554 case 'x':
2555 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2556 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2557 n += numprinted;
2558 numberresult += (numprinted + 1);
2559 assert(*(numberresult - 1) == '\0');
2560 assert(*(numberresult - 2) != '\0');
2561 assert(numprinted >= 0);
2562 assert(numberresult <= numberresults + numbersize);
2563 break;
2564 case 'p':
2565 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2566 /* %p is ill-defined: ensure leading 0x. */
2567 if (numberresult[1] == 'X')
2568 numberresult[1] = 'x';
2569 else if (numberresult[1] != 'x') {
2570 memmove(numberresult + 2, numberresult,
2571 strlen(numberresult) + 1);
2572 numberresult[0] = '0';
2573 numberresult[1] = 'x';
2574 numprinted += 2;
2575 }
2576 n += numprinted;
2577 numberresult += (numprinted + 1);
2578 assert(*(numberresult - 1) == '\0');
2579 assert(*(numberresult - 2) != '\0');
2580 assert(numprinted >= 0);
2581 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 break;
2583 case 's':
2584 {
2585 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002586 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002587 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002588 if (!str)
2589 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 /* since PyUnicode_DecodeUTF8 returns already flexible
2591 unicode objects, there is no need to call ready on them */
2592 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002593 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002595 /* Remember the str and switch to the next slot */
2596 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 break;
2598 }
2599 case 'U':
2600 {
2601 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002602 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 if (PyUnicode_READY(obj) == -1)
2604 goto fail;
2605 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002606 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002608 break;
2609 }
2610 case 'V':
2611 {
2612 PyObject *obj = va_arg(count, PyObject *);
2613 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002614 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002616 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002617 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618 if (PyUnicode_READY(obj) == -1)
2619 goto fail;
2620 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002621 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002623 *callresult++ = NULL;
2624 }
2625 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002626 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002627 if (!str_obj)
2628 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002629 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002630 Py_DECREF(str_obj);
2631 goto fail;
2632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002634 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002636 *callresult++ = str_obj;
2637 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 break;
2639 }
2640 case 'S':
2641 {
2642 PyObject *obj = va_arg(count, PyObject *);
2643 PyObject *str;
2644 assert(obj);
2645 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002646 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002647 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002648 if (PyUnicode_READY(str) == -1) {
2649 Py_DECREF(str);
2650 goto fail;
2651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002653 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 /* Remember the str and switch to the next slot */
2656 *callresult++ = str;
2657 break;
2658 }
2659 case 'R':
2660 {
2661 PyObject *obj = va_arg(count, PyObject *);
2662 PyObject *repr;
2663 assert(obj);
2664 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002665 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002667 if (PyUnicode_READY(repr) == -1) {
2668 Py_DECREF(repr);
2669 goto fail;
2670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002671 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002672 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 /* Remember the repr and switch to the next slot */
2675 *callresult++ = repr;
2676 break;
2677 }
2678 case 'A':
2679 {
2680 PyObject *obj = va_arg(count, PyObject *);
2681 PyObject *ascii;
2682 assert(obj);
2683 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002684 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002685 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002686 if (PyUnicode_READY(ascii) == -1) {
2687 Py_DECREF(ascii);
2688 goto fail;
2689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002691 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002692 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002693 /* Remember the repr and switch to the next slot */
2694 *callresult++ = ascii;
2695 break;
2696 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002697 default:
2698 /* if we stumble upon an unknown
2699 formatting code, copy the rest of
2700 the format string to the output
2701 string. (we cannot just skip the
2702 code, since there's no way to know
2703 what's in the argument list) */
2704 n += strlen(p);
2705 goto expand;
2706 }
2707 } else
2708 n++;
2709 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002710 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002711 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002712 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002713 we don't have to resize the string.
2714 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002715 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 if (!string)
2717 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 kind = PyUnicode_KIND(string);
2719 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002720 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002723 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002724 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002725 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002726
2727 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002728 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2729 /* checking for == because the last argument could be a empty
2730 string, which causes i to point to end, the assert at the end of
2731 the loop */
2732 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002733
Benjamin Peterson14339b62009-01-31 16:36:08 +00002734 switch (*f) {
2735 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002736 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002737 const int ordinal = va_arg(vargs, int);
2738 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002739 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002740 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002741 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002742 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002743 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002744 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002745 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002746 {
Victor Stinner184252a2012-06-16 02:57:41 +02002747 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 /* unused, since we already have the result */
2749 if (*f == 'p')
2750 (void) va_arg(vargs, void *);
2751 else
2752 (void) va_arg(vargs, int);
2753 /* extract the result from numberresults and append. */
Victor Stinner184252a2012-06-16 02:57:41 +02002754 len = strlen(numberresult);
2755 unicode_write_cstr(string, i, numberresult, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756 /* skip over the separating '\0' */
Victor Stinner184252a2012-06-16 02:57:41 +02002757 i += len;
2758 numberresult += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002759 assert(*numberresult == '\0');
2760 numberresult++;
2761 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002762 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002763 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002764 case 's':
2765 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002766 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002768 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002769 size = PyUnicode_GET_LENGTH(*callresult);
2770 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002771 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002772 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002773 /* We're done with the unicode()/repr() => forget it */
2774 Py_DECREF(*callresult);
2775 /* switch to next unicode()/repr() result */
2776 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002777 break;
2778 }
2779 case 'U':
2780 {
2781 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002782 Py_ssize_t size;
2783 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2784 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002785 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002786 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002787 break;
2788 }
2789 case 'V':
2790 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002792 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002793 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002794 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002795 size = PyUnicode_GET_LENGTH(obj);
2796 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002797 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002798 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002799 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002800 size = PyUnicode_GET_LENGTH(*callresult);
2801 assert(PyUnicode_KIND(*callresult) <=
2802 PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002803 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002804 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002805 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002806 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002807 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002808 break;
2809 }
2810 case 'S':
2811 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002812 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002813 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002814 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002815 /* unused, since we already have the result */
2816 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002817 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002818 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002819 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002820 /* We're done with the unicode()/repr() => forget it */
2821 Py_DECREF(*callresult);
2822 /* switch to next unicode()/repr() result */
2823 ++callresult;
2824 break;
2825 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002826 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002827 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002828 break;
2829 default:
Victor Stinner184252a2012-06-16 02:57:41 +02002830 {
2831 Py_ssize_t len = strlen(p);
2832 unicode_write_cstr(string, i, p, len);
2833 i += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002834 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002835 goto end;
2836 }
Victor Stinner184252a2012-06-16 02:57:41 +02002837 }
Victor Stinner1205f272010-09-11 00:54:47 +00002838 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002839 else {
2840 assert(i < PyUnicode_GET_LENGTH(string));
2841 PyUnicode_WRITE(kind, data, i++, *f);
2842 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002844 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002845
Benjamin Peterson29060642009-01-31 22:14:21 +00002846 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002847 if (callresults)
2848 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002849 if (numberresults)
2850 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002851 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002852 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002853 if (callresults) {
2854 PyObject **callresult2 = callresults;
2855 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002856 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002857 ++callresult2;
2858 }
2859 PyObject_Free(callresults);
2860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002861 if (numberresults)
2862 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002863 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002864}
2865
Walter Dörwaldd2034312007-05-18 16:29:38 +00002866PyObject *
2867PyUnicode_FromFormat(const char *format, ...)
2868{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002869 PyObject* ret;
2870 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002871
2872#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002873 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002874#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002875 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002876#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002877 ret = PyUnicode_FromFormatV(format, vargs);
2878 va_end(vargs);
2879 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002880}
2881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002882#ifdef HAVE_WCHAR_H
2883
Victor Stinner5593d8a2010-10-02 11:11:27 +00002884/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2885 convert a Unicode object to a wide character string.
2886
Victor Stinnerd88d9832011-09-06 02:00:05 +02002887 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002888 character) required to convert the unicode object. Ignore size argument.
2889
Victor Stinnerd88d9832011-09-06 02:00:05 +02002890 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002891 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002892 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002893static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002894unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002895 wchar_t *w,
2896 Py_ssize_t size)
2897{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002898 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002899 const wchar_t *wstr;
2900
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002901 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002902 if (wstr == NULL)
2903 return -1;
2904
Victor Stinner5593d8a2010-10-02 11:11:27 +00002905 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002906 if (size > res)
2907 size = res + 1;
2908 else
2909 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002910 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002911 return res;
2912 }
2913 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002914 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002915}
2916
2917Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002918PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002919 wchar_t *w,
2920 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921{
2922 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 PyErr_BadInternalCall();
2924 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002926 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002927}
2928
Victor Stinner137c34c2010-09-29 10:25:54 +00002929wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002930PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002931 Py_ssize_t *size)
2932{
2933 wchar_t* buffer;
2934 Py_ssize_t buflen;
2935
2936 if (unicode == NULL) {
2937 PyErr_BadInternalCall();
2938 return NULL;
2939 }
2940
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002941 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002942 if (buflen == -1)
2943 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002944 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002945 PyErr_NoMemory();
2946 return NULL;
2947 }
2948
Victor Stinner137c34c2010-09-29 10:25:54 +00002949 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2950 if (buffer == NULL) {
2951 PyErr_NoMemory();
2952 return NULL;
2953 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002954 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002955 if (buflen == -1) {
2956 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002958 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 if (size != NULL)
2960 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002961 return buffer;
2962}
2963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002964#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965
Alexander Belopolsky40018472011-02-26 01:02:56 +00002966PyObject *
2967PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002968{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002969 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002970 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002971 PyErr_SetString(PyExc_ValueError,
2972 "chr() arg not in range(0x110000)");
2973 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002974 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002975
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002976 if ((Py_UCS4)ordinal < 256)
2977 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002979 v = PyUnicode_New(1, ordinal);
2980 if (v == NULL)
2981 return NULL;
2982 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002983 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002984 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002985}
2986
Alexander Belopolsky40018472011-02-26 01:02:56 +00002987PyObject *
2988PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002990 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002991 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002992 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002993 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002994 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 Py_INCREF(obj);
2996 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002997 }
2998 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002999 /* For a Unicode subtype that's not a Unicode object,
3000 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003001 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003002 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003003 PyErr_Format(PyExc_TypeError,
3004 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003005 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003006 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003007}
3008
Alexander Belopolsky40018472011-02-26 01:02:56 +00003009PyObject *
3010PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003011 const char *encoding,
3012 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003013{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003014 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003015 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003016
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003018 PyErr_BadInternalCall();
3019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003021
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003022 /* Decoding bytes objects is the most common case and should be fast */
3023 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003024 if (PyBytes_GET_SIZE(obj) == 0)
3025 _Py_RETURN_UNICODE_EMPTY();
3026 v = PyUnicode_Decode(
3027 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3028 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003029 return v;
3030 }
3031
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003032 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 PyErr_SetString(PyExc_TypeError,
3034 "decoding str is not supported");
3035 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003036 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003037
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003038 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3039 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3040 PyErr_Format(PyExc_TypeError,
3041 "coercing to str: need bytes, bytearray "
3042 "or buffer-like object, %.80s found",
3043 Py_TYPE(obj)->tp_name);
3044 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003045 }
Tim Petersced69f82003-09-16 20:30:58 +00003046
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003047 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003048 PyBuffer_Release(&buffer);
3049 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003051
Serhiy Storchaka05997252013-01-26 12:14:02 +02003052 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003053 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003054 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055}
3056
/* Normalize an encoding name: convert it to lower case and replace '_' with
   '-' in order to catch e.g. UTF_8.  A NULL encoding is normalized to
   "utf-8".

   lower receives the NUL-terminated result; lower_len is the capacity of
   lower in bytes, terminating NUL included.

   Return 1 on success.  Return 0 on error, i.e. when the normalized name
   plus its terminating NUL does not fit into lower_len bytes (this includes
   lower_len == 0 and a NULL encoding with lower_len < 6). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* sizeof("utf-8") counts the terminating NUL: refuse to overflow a
           buffer that cannot hold the default name. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* Without this guard, lower_len - 1 would wrap around and the loop
       below would write past the (empty) buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last slot of the buffer: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3093
Alexander Belopolsky40018472011-02-26 01:02:56 +00003094PyObject *
3095PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003096 Py_ssize_t size,
3097 const char *encoding,
3098 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003099{
3100 PyObject *buffer = NULL, *unicode;
3101 Py_buffer info;
3102 char lower[11]; /* Enough for any encoding shortcut */
3103
Fred Drakee4315f52000-05-09 19:53:39 +00003104 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003105 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003106 if ((strcmp(lower, "utf-8") == 0) ||
3107 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003108 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003109 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003110 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003111 (strcmp(lower, "iso-8859-1") == 0))
3112 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003113#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003114 else if (strcmp(lower, "mbcs") == 0)
3115 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003116#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003117 else if (strcmp(lower, "ascii") == 0)
3118 return PyUnicode_DecodeASCII(s, size, errors);
3119 else if (strcmp(lower, "utf-16") == 0)
3120 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3121 else if (strcmp(lower, "utf-32") == 0)
3122 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3123 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124
3125 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003126 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003127 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003128 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003129 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 if (buffer == NULL)
3131 goto onError;
Serhiy Storchaka94ee3892014-02-24 14:43:03 +02003132 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133 if (unicode == NULL)
3134 goto onError;
3135 if (!PyUnicode_Check(unicode)) {
3136 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003137 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003138 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139 Py_DECREF(unicode);
3140 goto onError;
3141 }
3142 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003143 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003144
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146 Py_XDECREF(buffer);
3147 return NULL;
3148}
3149
Alexander Belopolsky40018472011-02-26 01:02:56 +00003150PyObject *
3151PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003152 const char *encoding,
3153 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003154{
3155 PyObject *v;
3156
3157 if (!PyUnicode_Check(unicode)) {
3158 PyErr_BadArgument();
3159 goto onError;
3160 }
3161
3162 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003163 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003164
3165 /* Decode via the codec registry */
3166 v = PyCodec_Decode(unicode, encoding, errors);
3167 if (v == NULL)
3168 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003169 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003170
Benjamin Peterson29060642009-01-31 22:14:21 +00003171 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003172 return NULL;
3173}
3174
Alexander Belopolsky40018472011-02-26 01:02:56 +00003175PyObject *
3176PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003177 const char *encoding,
3178 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003179{
3180 PyObject *v;
3181
3182 if (!PyUnicode_Check(unicode)) {
3183 PyErr_BadArgument();
3184 goto onError;
3185 }
3186
3187 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003188 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003189
3190 /* Decode via the codec registry */
3191 v = PyCodec_Decode(unicode, encoding, errors);
3192 if (v == NULL)
3193 goto onError;
3194 if (!PyUnicode_Check(v)) {
3195 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003196 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003197 Py_TYPE(v)->tp_name);
3198 Py_DECREF(v);
3199 goto onError;
3200 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003201 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003202
Benjamin Peterson29060642009-01-31 22:14:21 +00003203 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003204 return NULL;
3205}
3206
Alexander Belopolsky40018472011-02-26 01:02:56 +00003207PyObject *
3208PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003209 Py_ssize_t size,
3210 const char *encoding,
3211 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212{
3213 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003214
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 unicode = PyUnicode_FromUnicode(s, size);
3216 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3219 Py_DECREF(unicode);
3220 return v;
3221}
3222
Alexander Belopolsky40018472011-02-26 01:02:56 +00003223PyObject *
3224PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003225 const char *encoding,
3226 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003227{
3228 PyObject *v;
3229
3230 if (!PyUnicode_Check(unicode)) {
3231 PyErr_BadArgument();
3232 goto onError;
3233 }
3234
3235 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003237
3238 /* Encode via the codec registry */
3239 v = PyCodec_Encode(unicode, encoding, errors);
3240 if (v == NULL)
3241 goto onError;
3242 return v;
3243
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003245 return NULL;
3246}
3247
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 2 bytes wide).  Return the index, in wchar_t units,
   of the first character for which wcstombs() fails, or 0 if every
   character converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *cursor = wstr;
    char scratch[MB_LEN_MAX];
    /* One character (or surrogate pair) followed by a null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor;
        cursor += 1;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3294
Victor Stinner1b579672011-12-17 05:47:23 +01003295static int
3296locale_error_handler(const char *errors, int *surrogateescape)
3297{
3298 if (errors == NULL) {
3299 *surrogateescape = 0;
3300 return 0;
3301 }
3302
3303 if (strcmp(errors, "strict") == 0) {
3304 *surrogateescape = 0;
3305 return 0;
3306 }
3307 if (strcmp(errors, "surrogateescape") == 0) {
3308 *surrogateescape = 1;
3309 return 0;
3310 }
3311 PyErr_Format(PyExc_ValueError,
3312 "only 'strict' and 'surrogateescape' error handlers "
3313 "are supported, not '%s'",
3314 errors);
3315 return -1;
3316}
3317
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003318PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003319PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003320{
3321 Py_ssize_t wlen, wlen2;
3322 wchar_t *wstr;
3323 PyObject *bytes = NULL;
3324 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003325 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003326 PyObject *exc;
3327 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003328 int surrogateescape;
3329
3330 if (locale_error_handler(errors, &surrogateescape) < 0)
3331 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003332
3333 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3334 if (wstr == NULL)
3335 return NULL;
3336
3337 wlen2 = wcslen(wstr);
3338 if (wlen2 != wlen) {
3339 PyMem_Free(wstr);
3340 PyErr_SetString(PyExc_TypeError, "embedded null character");
3341 return NULL;
3342 }
3343
3344 if (surrogateescape) {
3345 /* locale encoding with surrogateescape */
3346 char *str;
3347
3348 str = _Py_wchar2char(wstr, &error_pos);
3349 if (str == NULL) {
3350 if (error_pos == (size_t)-1) {
3351 PyErr_NoMemory();
3352 PyMem_Free(wstr);
3353 return NULL;
3354 }
3355 else {
3356 goto encode_error;
3357 }
3358 }
3359 PyMem_Free(wstr);
3360
3361 bytes = PyBytes_FromString(str);
3362 PyMem_Free(str);
3363 }
3364 else {
3365 size_t len, len2;
3366
3367 len = wcstombs(NULL, wstr, 0);
3368 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003369 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003370 goto encode_error;
3371 }
3372
3373 bytes = PyBytes_FromStringAndSize(NULL, len);
3374 if (bytes == NULL) {
3375 PyMem_Free(wstr);
3376 return NULL;
3377 }
3378
3379 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3380 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003381 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003382 goto encode_error;
3383 }
3384 PyMem_Free(wstr);
3385 }
3386 return bytes;
3387
3388encode_error:
3389 errmsg = strerror(errno);
3390 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003391
3392 if (error_pos == (size_t)-1)
3393 error_pos = wcstombs_errorpos(wstr);
3394
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003395 PyMem_Free(wstr);
3396 Py_XDECREF(bytes);
3397
Victor Stinner2f197072011-12-17 07:08:30 +01003398 if (errmsg != NULL) {
3399 size_t errlen;
3400 wstr = _Py_char2wchar(errmsg, &errlen);
3401 if (wstr != NULL) {
3402 reason = PyUnicode_FromWideChar(wstr, errlen);
3403 PyMem_Free(wstr);
3404 } else
3405 errmsg = NULL;
3406 }
3407 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003408 reason = PyUnicode_FromString(
3409 "wcstombs() encountered an unencodable "
3410 "wide character");
3411 if (reason == NULL)
3412 return NULL;
3413
3414 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3415 "locale", unicode,
3416 (Py_ssize_t)error_pos,
3417 (Py_ssize_t)(error_pos+1),
3418 reason);
3419 Py_DECREF(reason);
3420 if (exc != NULL) {
3421 PyCodec_StrictErrors(exc);
3422 Py_XDECREF(exc);
3423 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003424 return NULL;
3425}
3426
Victor Stinnerad158722010-10-27 00:25:46 +00003427PyObject *
3428PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003429{
Victor Stinner99b95382011-07-04 14:23:54 +02003430#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003431 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003432#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003433 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003434#else
Victor Stinner793b5312011-04-27 00:24:21 +02003435 PyInterpreterState *interp = PyThreadState_GET()->interp;
3436 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3437 cannot use it to encode and decode filenames before it is loaded. Load
3438 the Python codec requires to encode at least its own filename. Use the C
3439 version of the locale codec until the codec registry is initialized and
3440 the Python codec is loaded.
3441
3442 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3443 cannot only rely on it: check also interp->fscodec_initialized for
3444 subinterpreters. */
3445 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003446 return PyUnicode_AsEncodedString(unicode,
3447 Py_FileSystemDefaultEncoding,
3448 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003449 }
3450 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003451 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003452 }
Victor Stinnerad158722010-10-27 00:25:46 +00003453#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003454}
3455
Alexander Belopolsky40018472011-02-26 01:02:56 +00003456PyObject *
3457PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003458 const char *encoding,
3459 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460{
3461 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003462 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003463
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 if (!PyUnicode_Check(unicode)) {
3465 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003466 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 }
Fred Drakee4315f52000-05-09 19:53:39 +00003468
Fred Drakee4315f52000-05-09 19:53:39 +00003469 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003470 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003471 if ((strcmp(lower, "utf-8") == 0) ||
3472 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003473 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003474 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003475 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003476 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003477 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003478 }
Victor Stinner37296e82010-06-10 13:36:23 +00003479 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003480 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003481 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003482 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003483#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003484 else if (strcmp(lower, "mbcs") == 0)
3485 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003486#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003487 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003488 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490
3491 /* Encode via the codec registry */
Serhiy Storchaka94ee3892014-02-24 14:43:03 +02003492 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003494 return NULL;
3495
3496 /* The normal path */
3497 if (PyBytes_Check(v))
3498 return v;
3499
3500 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003501 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003502 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003503 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003504
3505 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3506 "encoder %s returned bytearray instead of bytes",
3507 encoding);
3508 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003509 Py_DECREF(v);
3510 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003511 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003512
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003513 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3514 Py_DECREF(v);
3515 return b;
3516 }
3517
3518 PyErr_Format(PyExc_TypeError,
3519 "encoder did not return a bytes object (type=%.400s)",
3520 Py_TYPE(v)->tp_name);
3521 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003522 return NULL;
3523}
3524
Alexander Belopolsky40018472011-02-26 01:02:56 +00003525PyObject *
3526PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003527 const char *encoding,
3528 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003529{
3530 PyObject *v;
3531
3532 if (!PyUnicode_Check(unicode)) {
3533 PyErr_BadArgument();
3534 goto onError;
3535 }
3536
3537 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003538 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003539
3540 /* Encode via the codec registry */
3541 v = PyCodec_Encode(unicode, encoding, errors);
3542 if (v == NULL)
3543 goto onError;
3544 if (!PyUnicode_Check(v)) {
3545 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003546 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003547 Py_TYPE(v)->tp_name);
3548 Py_DECREF(v);
3549 goto onError;
3550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003552
Benjamin Peterson29060642009-01-31 22:14:21 +00003553 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 return NULL;
3555}
3556
/* Find the first byte sequence in `str` that mbrtowc() cannot decode.

   str: bytes to scan.
   len: number of bytes of `str` to examine.

   Returns the byte offset of the first invalid or incomplete multibyte
   sequence.  Returns 0 when no such sequence is found (including when an
   embedded NUL stops the scan) and when mbrtowc() is not available
   (HAVE_MBRTOWC undefined), so 0 does not by itself prove that the error
   sits on the first byte. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len) {
        /* mbrtowc() takes a const char *, no cast is needed. */
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(str - start);
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3587
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003588PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003589PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003590 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003591{
3592 wchar_t smallbuf[256];
3593 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3594 wchar_t *wstr;
3595 size_t wlen, wlen2;
3596 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003597 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003598 size_t error_pos;
3599 char *errmsg;
3600 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003601
3602 if (locale_error_handler(errors, &surrogateescape) < 0)
3603 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003604
3605 if (str[len] != '\0' || len != strlen(str)) {
3606 PyErr_SetString(PyExc_TypeError, "embedded null character");
3607 return NULL;
3608 }
3609
3610 if (surrogateescape)
3611 {
3612 wstr = _Py_char2wchar(str, &wlen);
3613 if (wstr == NULL) {
3614 if (wlen == (size_t)-1)
3615 PyErr_NoMemory();
3616 else
3617 PyErr_SetFromErrno(PyExc_OSError);
3618 return NULL;
3619 }
3620
3621 unicode = PyUnicode_FromWideChar(wstr, wlen);
3622 PyMem_Free(wstr);
3623 }
3624 else {
3625#ifndef HAVE_BROKEN_MBSTOWCS
3626 wlen = mbstowcs(NULL, str, 0);
3627#else
3628 wlen = len;
3629#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003630 if (wlen == (size_t)-1)
3631 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003632 if (wlen+1 <= smallbuf_len) {
3633 wstr = smallbuf;
3634 }
3635 else {
3636 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3637 return PyErr_NoMemory();
3638
3639 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3640 if (!wstr)
3641 return PyErr_NoMemory();
3642 }
3643
3644 /* This shouldn't fail now */
3645 wlen2 = mbstowcs(wstr, str, wlen+1);
3646 if (wlen2 == (size_t)-1) {
3647 if (wstr != smallbuf)
3648 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003649 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003650 }
3651#ifdef HAVE_BROKEN_MBSTOWCS
3652 assert(wlen2 == wlen);
3653#endif
3654 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3655 if (wstr != smallbuf)
3656 PyMem_Free(wstr);
3657 }
3658 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003659
3660decode_error:
3661 errmsg = strerror(errno);
3662 assert(errmsg != NULL);
3663
3664 error_pos = mbstowcs_errorpos(str, len);
3665 if (errmsg != NULL) {
3666 size_t errlen;
3667 wstr = _Py_char2wchar(errmsg, &errlen);
3668 if (wstr != NULL) {
3669 reason = PyUnicode_FromWideChar(wstr, errlen);
3670 PyMem_Free(wstr);
3671 } else
3672 errmsg = NULL;
3673 }
3674 if (errmsg == NULL)
3675 reason = PyUnicode_FromString(
3676 "mbstowcs() encountered an invalid multibyte sequence");
3677 if (reason == NULL)
3678 return NULL;
3679
3680 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3681 "locale", str, len,
3682 (Py_ssize_t)error_pos,
3683 (Py_ssize_t)(error_pos+1),
3684 reason);
3685 Py_DECREF(reason);
3686 if (exc != NULL) {
3687 PyCodec_StrictErrors(exc);
3688 Py_XDECREF(exc);
3689 }
3690 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003691}
3692
3693PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003694PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003695{
3696 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003697 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003698}
3699
3700
3701PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003702PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003703 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003704 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3705}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003706
Christian Heimes5894ba72007-11-04 11:43:14 +00003707PyObject*
3708PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3709{
Victor Stinner99b95382011-07-04 14:23:54 +02003710#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003711 return PyUnicode_DecodeMBCS(s, size, NULL);
3712#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003713 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003714#else
Victor Stinner793b5312011-04-27 00:24:21 +02003715 PyInterpreterState *interp = PyThreadState_GET()->interp;
3716 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3717 cannot use it to encode and decode filenames before it is loaded. Load
3718 the Python codec requires to encode at least its own filename. Use the C
3719 version of the locale codec until the codec registry is initialized and
3720 the Python codec is loaded.
3721
3722 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3723 cannot only rely on it: check also interp->fscodec_initialized for
3724 subinterpreters. */
3725 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003726 return PyUnicode_Decode(s, size,
3727 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003728 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003729 }
3730 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003731 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003732 }
Victor Stinnerad158722010-10-27 00:25:46 +00003733#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003734}
3735
Martin v. Löwis011e8422009-05-05 04:43:17 +00003736
3737int
Antoine Pitrou13348842012-01-29 18:36:34 +01003738_PyUnicode_HasNULChars(PyObject* s)
3739{
3740 static PyObject *nul = NULL;
3741
3742 if (nul == NULL)
3743 nul = PyUnicode_FromStringAndSize("\0", 1);
3744 if (nul == NULL)
3745 return -1;
3746 return PyUnicode_Contains(s, nul);
3747}
3748
3749
3750int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003751PyUnicode_FSConverter(PyObject* arg, void* addr)
3752{
3753 PyObject *output = NULL;
3754 Py_ssize_t size;
3755 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003756 if (arg == NULL) {
3757 Py_DECREF(*(PyObject**)addr);
3758 return 1;
3759 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003760 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003761 output = arg;
3762 Py_INCREF(output);
3763 }
3764 else {
3765 arg = PyUnicode_FromObject(arg);
3766 if (!arg)
3767 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003768 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003769 Py_DECREF(arg);
3770 if (!output)
3771 return 0;
3772 if (!PyBytes_Check(output)) {
3773 Py_DECREF(output);
3774 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3775 return 0;
3776 }
3777 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003778 size = PyBytes_GET_SIZE(output);
3779 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003780 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003781 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003782 Py_DECREF(output);
3783 return 0;
3784 }
3785 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003786 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003787}
3788
3789
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003790int
3791PyUnicode_FSDecoder(PyObject* arg, void* addr)
3792{
3793 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003794 if (arg == NULL) {
3795 Py_DECREF(*(PyObject**)addr);
3796 return 1;
3797 }
3798 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003799 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003800 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003801 output = arg;
3802 Py_INCREF(output);
3803 }
3804 else {
3805 arg = PyBytes_FromObject(arg);
3806 if (!arg)
3807 return 0;
3808 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3809 PyBytes_GET_SIZE(arg));
3810 Py_DECREF(arg);
3811 if (!output)
3812 return 0;
3813 if (!PyUnicode_Check(output)) {
3814 Py_DECREF(output);
3815 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3816 return 0;
3817 }
3818 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003819 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003820 Py_DECREF(output);
3821 return 0;
3822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003823 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003824 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003825 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3826 Py_DECREF(output);
3827 return 0;
3828 }
3829 *(PyObject**)addr = output;
3830 return Py_CLEANUP_SUPPORTED;
3831}
3832
3833
Martin v. Löwis5b222132007-06-10 09:51:05 +00003834char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003836{
Christian Heimesf3863112007-11-22 07:46:41 +00003837 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003839 if (!PyUnicode_Check(unicode)) {
3840 PyErr_BadArgument();
3841 return NULL;
3842 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003843 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003844 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003846 if (PyUnicode_UTF8(unicode) == NULL) {
3847 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003848 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3849 if (bytes == NULL)
3850 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003851 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3852 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853 Py_DECREF(bytes);
3854 return NULL;
3855 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003856 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3857 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3858 PyBytes_AS_STRING(bytes),
3859 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860 Py_DECREF(bytes);
3861 }
3862
3863 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003864 *psize = PyUnicode_UTF8_LENGTH(unicode);
3865 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003866}
3867
3868char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3872}
3873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874Py_UNICODE *
3875PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 const unsigned char *one_byte;
3878#if SIZEOF_WCHAR_T == 4
3879 const Py_UCS2 *two_bytes;
3880#else
3881 const Py_UCS4 *four_bytes;
3882 const Py_UCS4 *ucs4_end;
3883 Py_ssize_t num_surrogates;
3884#endif
3885 wchar_t *w;
3886 wchar_t *wchar_end;
3887
3888 if (!PyUnicode_Check(unicode)) {
3889 PyErr_BadArgument();
3890 return NULL;
3891 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003892 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003894 assert(_PyUnicode_KIND(unicode) != 0);
3895 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003896
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003897 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003898#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003899 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3900 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901 num_surrogates = 0;
3902
3903 for (; four_bytes < ucs4_end; ++four_bytes) {
3904 if (*four_bytes > 0xFFFF)
3905 ++num_surrogates;
3906 }
3907
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003908 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3909 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3910 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911 PyErr_NoMemory();
3912 return NULL;
3913 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003914 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003916 w = _PyUnicode_WSTR(unicode);
3917 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3918 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3920 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003921 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003923 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3924 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003925 }
3926 else
3927 *w = *four_bytes;
3928
3929 if (w > wchar_end) {
3930 assert(0 && "Miscalculated string end");
3931 }
3932 }
3933 *w = 0;
3934#else
3935 /* sizeof(wchar_t) == 4 */
3936 Py_FatalError("Impossible unicode object state, wstr and str "
3937 "should share memory already.");
3938 return NULL;
3939#endif
3940 }
3941 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003942 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3943 (_PyUnicode_LENGTH(unicode) + 1));
3944 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945 PyErr_NoMemory();
3946 return NULL;
3947 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003948 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3949 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3950 w = _PyUnicode_WSTR(unicode);
3951 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003953 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3954 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955 for (; w < wchar_end; ++one_byte, ++w)
3956 *w = *one_byte;
3957 /* null-terminate the wstr */
3958 *w = 0;
3959 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003960 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003962 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963 for (; w < wchar_end; ++two_bytes, ++w)
3964 *w = *two_bytes;
3965 /* null-terminate the wstr */
3966 *w = 0;
3967#else
3968 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003969 PyObject_FREE(_PyUnicode_WSTR(unicode));
3970 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971 Py_FatalError("Impossible unicode object state, wstr "
3972 "and str should share memory already.");
3973 return NULL;
3974#endif
3975 }
3976 else {
3977 assert(0 && "This should never happen.");
3978 }
3979 }
3980 }
3981 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003982 *size = PyUnicode_WSTR_LENGTH(unicode);
3983 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003984}
3985
Alexander Belopolsky40018472011-02-26 01:02:56 +00003986Py_UNICODE *
3987PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003989 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990}
3991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992
Alexander Belopolsky40018472011-02-26 01:02:56 +00003993Py_ssize_t
3994PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995{
3996 if (!PyUnicode_Check(unicode)) {
3997 PyErr_BadArgument();
3998 goto onError;
3999 }
4000 return PyUnicode_GET_SIZE(unicode);
4001
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 return -1;
4004}
4005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006Py_ssize_t
4007PyUnicode_GetLength(PyObject *unicode)
4008{
Victor Stinner07621332012-06-16 04:53:46 +02004009 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010 PyErr_BadArgument();
4011 return -1;
4012 }
Victor Stinner07621332012-06-16 04:53:46 +02004013 if (PyUnicode_READY(unicode) == -1)
4014 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015 return PyUnicode_GET_LENGTH(unicode);
4016}
4017
4018Py_UCS4
4019PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4020{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004021 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4022 PyErr_BadArgument();
4023 return (Py_UCS4)-1;
4024 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004025 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004026 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027 return (Py_UCS4)-1;
4028 }
4029 return PyUnicode_READ_CHAR(unicode, index);
4030}
4031
4032int
4033PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4034{
4035 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004036 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 return -1;
4038 }
Victor Stinner488fa492011-12-12 00:01:39 +01004039 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004040 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004041 PyErr_SetString(PyExc_IndexError, "string index out of range");
4042 return -1;
4043 }
Victor Stinner488fa492011-12-12 00:01:39 +01004044 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004045 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004046 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4047 PyErr_SetString(PyExc_ValueError, "character out of range");
4048 return -1;
4049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4051 index, ch);
4052 return 0;
4053}
4054
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4060
Victor Stinner554f3f02010-06-16 23:33:54 +00004061/* create or adjust a UnicodeDecodeError */
4062static void
4063make_decode_exception(PyObject **exceptionObject,
4064 const char *encoding,
4065 const char *input, Py_ssize_t length,
4066 Py_ssize_t startpos, Py_ssize_t endpos,
4067 const char *reason)
4068{
4069 if (*exceptionObject == NULL) {
4070 *exceptionObject = PyUnicodeDecodeError_Create(
4071 encoding, input, length, startpos, endpos, reason);
4072 }
4073 else {
4074 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4075 goto onError;
4076 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4077 goto onError;
4078 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4079 goto onError;
4080 }
4081 return;
4082
4083onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004084 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004085}
4086
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087/* error handling callback helper:
4088 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004089 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004090 and adjust various state variables.
4091 return 0 on success, -1 on error
4092*/
4093
Alexander Belopolsky40018472011-02-26 01:02:56 +00004094static int
4095unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004096 const char *encoding, const char *reason,
4097 const char **input, const char **inend, Py_ssize_t *startinpos,
4098 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004099 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004100{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004101 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102
4103 PyObject *restuple = NULL;
4104 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004105 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004106 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004107 Py_ssize_t requiredsize;
4108 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004109 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 int res = -1;
4111
Victor Stinner596a6c42011-11-09 00:02:18 +01004112 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4113 outsize = PyUnicode_GET_LENGTH(*output);
4114 else
4115 outsize = _PyUnicode_WSTR_LENGTH(*output);
4116
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004118 *errorHandler = PyCodec_LookupError(errors);
4119 if (*errorHandler == NULL)
4120 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 }
4122
Victor Stinner554f3f02010-06-16 23:33:54 +00004123 make_decode_exception(exceptionObject,
4124 encoding,
4125 *input, *inend - *input,
4126 *startinpos, *endinpos,
4127 reason);
4128 if (*exceptionObject == NULL)
4129 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130
4131 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4132 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004135 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137 }
4138 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004139 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004140 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004141 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004142
4143 /* Copy back the bytes variables, which might have been modified by the
4144 callback */
4145 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4146 if (!inputobj)
4147 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004148 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004149 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004150 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004151 *input = PyBytes_AS_STRING(inputobj);
4152 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004153 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004154 /* we can DECREF safely, as the exception has another reference,
4155 so the object won't go away. */
4156 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004157
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004159 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004160 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4162 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004163 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164
Victor Stinner596a6c42011-11-09 00:02:18 +01004165 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4166 /* need more space? (at least enough for what we
4167 have+the replacement+the rest of the string (starting
4168 at the new input position), so we won't have to check space
4169 when there are no errors in the rest of the string) */
4170 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004171 requiredsize = *outpos;
4172 if (requiredsize > PY_SSIZE_T_MAX - replen)
4173 goto overflow;
4174 requiredsize += replen;
4175 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4176 goto overflow;
4177 requiredsize += insize - newpos;
Victor Stinner596a6c42011-11-09 00:02:18 +01004178 if (requiredsize > outsize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004179 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinner596a6c42011-11-09 00:02:18 +01004180 requiredsize = 2*outsize;
4181 if (unicode_resize(output, requiredsize) < 0)
4182 goto onError;
4183 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004184 if (unicode_widen(output, *outpos,
4185 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004186 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004187 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004188 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004190 else {
4191 wchar_t *repwstr;
4192 Py_ssize_t repwlen;
4193 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4194 if (repwstr == NULL)
4195 goto onError;
4196 /* need more space? (at least enough for what we
4197 have+the replacement+the rest of the string (starting
4198 at the new input position), so we won't have to check space
4199 when there are no errors in the rest of the string) */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004200 requiredsize = *outpos;
4201 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4202 goto overflow;
4203 requiredsize += repwlen;
4204 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4205 goto overflow;
4206 requiredsize += insize - newpos;
Victor Stinner596a6c42011-11-09 00:02:18 +01004207 if (requiredsize > outsize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004208 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinner596a6c42011-11-09 00:02:18 +01004209 requiredsize = 2*outsize;
4210 if (unicode_resize(output, requiredsize) < 0)
4211 goto onError;
4212 }
4213 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4214 *outpos += repwlen;
4215 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004217 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004218
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004219 /* we made it! */
4220 res = 0;
4221
Benjamin Peterson29060642009-01-31 22:14:21 +00004222 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 Py_XDECREF(restuple);
4224 return res;
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004225
4226 overflow:
4227 PyErr_SetString(PyExc_OverflowError,
4228 "decoded result is too long for a Python string");
4229 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230}
4231
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004232/* --- UTF-7 Codec -------------------------------------------------------- */
4233
Antoine Pitrou244651a2009-05-04 18:56:13 +00004234/* See RFC2152 for details. We encode conservatively and decode liberally. */
4235
4236/* Three simple macros defining base-64. */
4237
/* IS_BASE64(c): non-zero when c belongs to the base-64 alphabet, i.e. it
 * is an ASCII letter, an ASCII digit, '+' or '/'.
 * Note: the argument is evaluated more than once. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))
4245
/* FROM_BASE64(c): the 6-bit value (0..63) of the base-64 character c.
 * The caller is expected to have checked IS_BASE64(c) first; any
 * character that is not a letter, digit or '+' yields 63 (the value
 * of '/').  Note: the argument is evaluated more than once. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     63)
4253
/* TO_BASE64(n): the base-64 character that encodes the low 6 bits of n.
 * Higher bits of n are ignored. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4258
/* DECODE_DIRECT(c): true when a byte met outside a shift sequence in
 * UTF-7 input stands for itself.  Decoding is deliberately permissive:
 * every ASCII byte is taken literally except '+', which opens a
 * base-64 section.  Note: the argument is evaluated more than once. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4266
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [  bsl  ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4300
/* ENCODE_DIRECT(c, directO, directWS): true when character c should be
 * encoded as itself rather than inside a base-64 section.
 *
 *   c         -- the character (code point) to classify
 *   directO   -- non-zero if "Set O" characters are written directly
 *   directWS  -- non-zero if whitespace characters are written directly
 *
 * "Set D" characters are always direct; NUL and anything >= 128 never
 * are.  RFC2152 makes it clear that the Set O / whitespace policy varies
 * between applications, so this code needs to be flexible.
 *
 * Every parameter is parenthesized in the expansion so that arbitrary
 * expressions (e.g. `a || b`) can be passed safely.  c is evaluated more
 * than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004312
Alexander Belopolsky40018472011-02-26 01:02:56 +00004313PyObject *
4314PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004315 Py_ssize_t size,
4316 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004317{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004318 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4319}
4320
Antoine Pitrou244651a2009-05-04 18:56:13 +00004321/* The decoder. The only state we preserve is our read position,
4322 * i.e. how many characters we have consumed. So if we end in the
4323 * middle of a shift sequence we have to back off the read position
4324 * and the output to the beginning of the sequence, otherwise we lose
4325 * all the shift state (seen bits, number of bits seen, high
4326 * surrogate). */
4327
Alexander Belopolsky40018472011-02-26 01:02:56 +00004328PyObject *
4329PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004330 Py_ssize_t size,
4331 const char *errors,
4332 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004333{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004334 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004335 Py_ssize_t startinpos;
4336 Py_ssize_t endinpos;
4337 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004338 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004339 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004340 const char *errmsg = "";
4341 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004342 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 unsigned int base64bits = 0;
4344 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004345 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346 PyObject *errorHandler = NULL;
4347 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004348
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004349 /* Start off assuming it's all ASCII. Widen later as necessary. */
4350 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351 if (!unicode)
4352 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004353 if (size == 0) {
4354 if (consumed)
4355 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004356 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004357 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004358
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004359 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004360 e = s + size;
4361
4362 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004363 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004364 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004365 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004366
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 if (inShift) { /* in a base-64 section */
4368 if (IS_BASE64(ch)) { /* consume a base-64 character */
4369 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4370 base64bits += 6;
4371 s++;
4372 if (base64bits >= 16) {
4373 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004374 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 base64bits -= 16;
4376 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004377 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004378 if (surrogate) {
4379 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004380 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4381 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004382 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4383 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004384 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004385 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 }
4387 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004388 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4389 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004390 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 }
4392 }
Victor Stinner551ac952011-11-29 22:58:13 +01004393 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004394 /* first surrogate */
4395 surrogate = outCh;
4396 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004397 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004398 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4399 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 }
4401 }
4402 }
4403 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004404 inShift = 0;
4405 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004406 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004407 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4408 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004409 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004411 if (base64bits > 0) { /* left-over bits */
4412 if (base64bits >= 6) {
4413 /* We've seen at least one base-64 character */
4414 errmsg = "partial character in shift sequence";
4415 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 else {
4418 /* Some bits remain; they should be zero */
4419 if (base64buffer != 0) {
4420 errmsg = "non-zero padding bits in shift sequence";
4421 goto utf7Error;
4422 }
4423 }
4424 }
4425 if (ch != '-') {
4426 /* '-' is absorbed; other terminating
4427 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004428 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4429 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004430 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004431 }
4432 }
4433 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004435 s++; /* consume '+' */
4436 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004438 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4439 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004440 }
4441 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004443 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004444 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004445 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004446 }
4447 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004449 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4450 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004451 s++;
4452 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453 else {
4454 startinpos = s-starts;
4455 s++;
4456 errmsg = "unexpected special character";
4457 goto utf7Error;
4458 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004459 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 endinpos = s-starts;
4462 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004463 errors, &errorHandler,
4464 "utf7", errmsg,
4465 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004466 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004467 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004468 }
4469
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 /* end of string */
4471
4472 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4473 /* if we're in an inconsistent state, that's an error */
4474 if (surrogate ||
4475 (base64bits >= 6) ||
4476 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004477 endinpos = size;
4478 if (unicode_decode_call_errorhandler(
4479 errors, &errorHandler,
4480 "utf7", "unterminated shift sequence",
4481 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004482 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004483 goto onError;
4484 if (s < e)
4485 goto restart;
4486 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004487 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004488
4489 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004490 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004492 *consumed = startinpos;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004493 if (outpos != shiftOutStart &&
4494 PyUnicode_MAX_CHAR_VALUE(unicode) > 127) {
4495 PyObject *result = PyUnicode_FromKindAndData(
4496 PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4497 shiftOutStart);
4498 Py_DECREF(unicode);
4499 unicode = result;
4500 }
4501 outpos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 }
4503 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004504 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004505 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004506 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004507
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004508 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004509 goto onError;
4510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 Py_XDECREF(errorHandler);
4512 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004513 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004514
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 Py_XDECREF(errorHandler);
4517 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004518 Py_DECREF(unicode);
4519 return NULL;
4520}
4521
4522
Alexander Belopolsky40018472011-02-26 01:02:56 +00004523PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004524_PyUnicode_EncodeUTF7(PyObject *str,
4525 int base64SetO,
4526 int base64WhiteSpace,
4527 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004529 int kind;
4530 void *data;
4531 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004532 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004533 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004534 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 unsigned int base64bits = 0;
4536 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004537 char * out;
4538 char * start;
4539
Benjamin Petersonbac79492012-01-14 13:34:47 -05004540 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004541 return NULL;
4542 kind = PyUnicode_KIND(str);
4543 data = PyUnicode_DATA(str);
4544 len = PyUnicode_GET_LENGTH(str);
4545
4546 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004547 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004549 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004550 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004551 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004552 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004553 if (v == NULL)
4554 return NULL;
4555
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004556 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004557 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004558 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004559
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560 if (inShift) {
4561 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4562 /* shifting out */
4563 if (base64bits) { /* output remaining bits */
4564 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4565 base64buffer = 0;
4566 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004567 }
4568 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 /* Characters not in the BASE64 set implicitly unshift the sequence
4570 so no '-' is required, except if the character is itself a '-' */
4571 if (IS_BASE64(ch) || ch == '-') {
4572 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004573 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 *out++ = (char) ch;
4575 }
4576 else {
4577 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004578 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 else { /* not in a shift sequence */
4581 if (ch == '+') {
4582 *out++ = '+';
4583 *out++ = '-';
4584 }
4585 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4586 *out++ = (char) ch;
4587 }
4588 else {
4589 *out++ = '+';
4590 inShift = 1;
4591 goto encode_char;
4592 }
4593 }
4594 continue;
4595encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004597 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004598
Antoine Pitrou244651a2009-05-04 18:56:13 +00004599 /* code first surrogate */
4600 base64bits += 16;
4601 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4602 while (base64bits >= 6) {
4603 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4604 base64bits -= 6;
4605 }
4606 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004607 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 base64bits += 16;
4610 base64buffer = (base64buffer << 16) | ch;
4611 while (base64bits >= 6) {
4612 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4613 base64bits -= 6;
4614 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004615 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 if (base64bits)
4617 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4618 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004619 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004620 if (_PyBytes_Resize(&v, out - start) < 0)
4621 return NULL;
4622 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004623}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004624PyObject *
4625PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4626 Py_ssize_t size,
4627 int base64SetO,
4628 int base64WhiteSpace,
4629 const char *errors)
4630{
4631 PyObject *result;
4632 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4633 if (tmp == NULL)
4634 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004635 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004636 base64WhiteSpace, errors);
4637 Py_DECREF(tmp);
4638 return result;
4639}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004640
/* The UTF-7 helper macros are private to the codec above. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004646
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647/* --- UTF-8 Codec -------------------------------------------------------- */
4648
Alexander Belopolsky40018472011-02-26 01:02:56 +00004649PyObject *
4650PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004651 Py_ssize_t size,
4652 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653{
Walter Dörwald69652032004-09-07 20:24:22 +00004654 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4655}
4656
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004657#include "stringlib/asciilib.h"
4658#include "stringlib/codecs.h"
4659#include "stringlib/undef.h"
4660
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004661#include "stringlib/ucs1lib.h"
4662#include "stringlib/codecs.h"
4663#include "stringlib/undef.h"
4664
4665#include "stringlib/ucs2lib.h"
4666#include "stringlib/codecs.h"
4667#include "stringlib/undef.h"
4668
4669#include "stringlib/ucs4lib.h"
4670#include "stringlib/codecs.h"
4671#include "stringlib/undef.h"
4672
Antoine Pitrouab868312009-01-10 15:40:25 +00004673/* Mask to quickly check whether a C 'long' contains a
4674 non-ASCII, UTF8-encoded char. */
4675#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004676# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004677#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004678# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004679#else
4680# error C 'long' size should be either 4 or 8!
4681#endif
4682
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004683static Py_ssize_t
4684ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004685{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004686 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004687 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004688
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004689 /*
4690 * Issue #17237: m68k is a bit different from most architectures in
4691 * that objects do not use "natural alignment" - for example, int and
4692 * long are only aligned at 2-byte boundaries. Therefore the assert()
4693 * won't work; also, tests have shown that skipping the "optimised
4694 * version" will even speed up m68k.
4695 */
4696#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004697#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004698 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4699 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004700 /* Fast path, see in STRINGLIB(utf8_decode) for
4701 an explanation. */
4702 /* Help register allocation */
4703 register const char *_p = p;
4704 register Py_UCS1 * q = dest;
4705 while (_p < aligned_end) {
4706 unsigned long value = *(const unsigned long *) _p;
4707 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004708 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004709 *((unsigned long *)q) = value;
4710 _p += SIZEOF_LONG;
4711 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004712 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004713 p = _p;
4714 while (p < end) {
4715 if ((unsigned char)*p & 0x80)
4716 break;
4717 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004719 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004722#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004723 while (p < end) {
4724 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4725 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004726 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004727 /* Help register allocation */
4728 register const char *_p = p;
4729 while (_p < aligned_end) {
4730 unsigned long value = *(unsigned long *) _p;
4731 if (value & ASCII_CHAR_MASK)
4732 break;
4733 _p += SIZEOF_LONG;
4734 }
4735 p = _p;
4736 if (_p == end)
4737 break;
4738 }
4739 if ((unsigned char)*p & 0x80)
4740 break;
4741 ++p;
4742 }
4743 memcpy(dest, start, p - start);
4744 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745}
Antoine Pitrouab868312009-01-10 15:40:25 +00004746
Victor Stinner785938e2011-12-11 20:09:03 +01004747PyObject *
4748PyUnicode_DecodeUTF8Stateful(const char *s,
4749 Py_ssize_t size,
4750 const char *errors,
4751 Py_ssize_t *consumed)
4752{
Victor Stinner785938e2011-12-11 20:09:03 +01004753 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004754 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004755 const char *end = s + size;
4756 Py_ssize_t outpos;
4757
4758 Py_ssize_t startinpos;
4759 Py_ssize_t endinpos;
4760 const char *errmsg = "";
4761 PyObject *errorHandler = NULL;
4762 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004763
4764 if (size == 0) {
4765 if (consumed)
4766 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004767 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004768 }
4769
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4771 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004772 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004773 *consumed = 1;
4774 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004775 }
4776
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004777 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004778 if (!unicode)
4779 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004780
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004781 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4782 s += outpos;
4783 while (s < end) {
4784 Py_UCS4 ch;
4785 int kind = PyUnicode_KIND(unicode);
4786 if (kind == PyUnicode_1BYTE_KIND) {
4787 if (PyUnicode_IS_ASCII(unicode))
4788 ch = asciilib_utf8_decode(&s, end,
4789 PyUnicode_1BYTE_DATA(unicode), &outpos);
4790 else
4791 ch = ucs1lib_utf8_decode(&s, end,
4792 PyUnicode_1BYTE_DATA(unicode), &outpos);
4793 } else if (kind == PyUnicode_2BYTE_KIND) {
4794 ch = ucs2lib_utf8_decode(&s, end,
4795 PyUnicode_2BYTE_DATA(unicode), &outpos);
4796 } else {
4797 assert(kind == PyUnicode_4BYTE_KIND);
4798 ch = ucs4lib_utf8_decode(&s, end,
4799 PyUnicode_4BYTE_DATA(unicode), &outpos);
4800 }
4801
4802 switch (ch) {
4803 case 0:
4804 if (s == end || consumed)
4805 goto End;
4806 errmsg = "unexpected end of data";
4807 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004808 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004809 break;
4810 case 1:
4811 errmsg = "invalid start byte";
4812 startinpos = s - starts;
4813 endinpos = startinpos + 1;
4814 break;
4815 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004816 case 3:
4817 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004818 errmsg = "invalid continuation byte";
4819 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004820 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004821 break;
4822 default:
4823 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4824 goto onError;
4825 continue;
4826 }
4827
4828 if (unicode_decode_call_errorhandler(
4829 errors, &errorHandler,
4830 "utf-8", errmsg,
4831 &starts, &end, &startinpos, &endinpos, &exc, &s,
4832 &unicode, &outpos))
4833 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004834 }
4835
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004836End:
4837 if (unicode_resize(&unicode, outpos) < 0)
4838 goto onError;
4839
4840 if (consumed)
4841 *consumed = s - starts;
4842
4843 Py_XDECREF(errorHandler);
4844 Py_XDECREF(exc);
4845 assert(_PyUnicode_CheckConsistency(unicode, 1));
4846 return unicode;
4847
4848onError:
4849 Py_XDECREF(errorHandler);
4850 Py_XDECREF(exc);
4851 Py_XDECREF(unicode);
4852 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004853}
4854
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Bytes that cannot be decoded are mapped to the code unit
   0xDC00 + byte.  With a 16-bit wchar_t, characters above U+FFFF are
   stored as a surrogate pair.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory), or NULL on memory allocation error
   (no exception is set by this function itself). */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count.  The addition is done in size_t so that
       size == PY_SSIZE_T_MAX cannot overflow a signed integer. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < ((size_t)size + 1))
        return NULL;
    unicode = PyMem_Malloc(((size_t)size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* not expected: every character fits a 4-byte wchar_t */
            assert(0);
#else
            /* A character handed back here did not fit a single 16-bit
               unit, so it must be a non-BMP code point (not a
               surrogate). */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004911/* Primary internal function which creates utf8 encoded bytes objects.
4912
4913 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004914 and allocate exactly as much space needed at the end. Else allocate the
4915 maximum possible needed (4 result bytes per Unicode character), and return
4916 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004917*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004918PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004919_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920{
Victor Stinner6099a032011-12-18 14:22:26 +01004921 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004922 void *data;
4923 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004925 if (!PyUnicode_Check(unicode)) {
4926 PyErr_BadArgument();
4927 return NULL;
4928 }
4929
4930 if (PyUnicode_READY(unicode) == -1)
4931 return NULL;
4932
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004933 if (PyUnicode_UTF8(unicode))
4934 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4935 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004936
4937 kind = PyUnicode_KIND(unicode);
4938 data = PyUnicode_DATA(unicode);
4939 size = PyUnicode_GET_LENGTH(unicode);
4940
Benjamin Petersonead6b532011-12-20 17:23:42 -06004941 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004942 default:
4943 assert(0);
4944 case PyUnicode_1BYTE_KIND:
4945 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4946 assert(!PyUnicode_IS_ASCII(unicode));
4947 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4948 case PyUnicode_2BYTE_KIND:
4949 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4950 case PyUnicode_4BYTE_KIND:
4951 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953}
4954
Alexander Belopolsky40018472011-02-26 01:02:56 +00004955PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004956PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4957 Py_ssize_t size,
4958 const char *errors)
4959{
4960 PyObject *v, *unicode;
4961
4962 unicode = PyUnicode_FromUnicode(s, size);
4963 if (unicode == NULL)
4964 return NULL;
4965 v = _PyUnicode_AsUTF8String(unicode, errors);
4966 Py_DECREF(unicode);
4967 return v;
4968}
4969
4970PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004971PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004972{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004973 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974}
4975
Walter Dörwald41980ca2007-08-16 21:55:45 +00004976/* --- UTF-32 Codec ------------------------------------------------------- */
4977
4978PyObject *
4979PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004980 Py_ssize_t size,
4981 const char *errors,
4982 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004983{
4984 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4985}
4986
4987PyObject *
4988PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 Py_ssize_t size,
4990 const char *errors,
4991 int *byteorder,
4992 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004993{
4994 const char *starts = s;
4995 Py_ssize_t startinpos;
4996 Py_ssize_t endinpos;
4997 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004998 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004999 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005000 int bo = 0; /* assume native ordering by default */
5001 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005002 /* Offsets from q for retrieving bytes in the right order. */
5003#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5004 int iorder[] = {0, 1, 2, 3};
5005#else
5006 int iorder[] = {3, 2, 1, 0};
5007#endif
5008 PyObject *errorHandler = NULL;
5009 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005010
Walter Dörwald41980ca2007-08-16 21:55:45 +00005011 q = (unsigned char *)s;
5012 e = q + size;
5013
5014 if (byteorder)
5015 bo = *byteorder;
5016
5017 /* Check for BOM marks (U+FEFF) in the input and adjust current
5018 byte order setting accordingly. In native mode, the leading BOM
5019 mark is skipped, in all other modes, it is copied to the output
5020 stream as-is (giving a ZWNBSP character). */
5021 if (bo == 0) {
5022 if (size >= 4) {
5023 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005025#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 if (bom == 0x0000FEFF) {
5027 q += 4;
5028 bo = -1;
5029 }
5030 else if (bom == 0xFFFE0000) {
5031 q += 4;
5032 bo = 1;
5033 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005034#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 if (bom == 0x0000FEFF) {
5036 q += 4;
5037 bo = 1;
5038 }
5039 else if (bom == 0xFFFE0000) {
5040 q += 4;
5041 bo = -1;
5042 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005043#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005044 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005045 }
5046
5047 if (bo == -1) {
5048 /* force LE */
5049 iorder[0] = 0;
5050 iorder[1] = 1;
5051 iorder[2] = 2;
5052 iorder[3] = 3;
5053 }
5054 else if (bo == 1) {
5055 /* force BE */
5056 iorder[0] = 3;
5057 iorder[1] = 2;
5058 iorder[2] = 1;
5059 iorder[3] = 0;
5060 }
5061
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005062 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005063 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005064 if (!unicode)
5065 return NULL;
5066 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005067 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005068 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005069
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005071 Py_UCS4 ch;
5072 /* remaining bytes at the end? (size should be divisible by 4) */
5073 if (e-q<4) {
5074 if (consumed)
5075 break;
5076 errmsg = "truncated data";
5077 startinpos = ((const char *)q)-starts;
5078 endinpos = ((const char *)e)-starts;
5079 goto utf32Error;
5080 /* The remaining input chars are ignored if the callback
5081 chooses to skip the input */
5082 }
5083 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5084 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005085
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 if (ch >= 0x110000)
5087 {
5088 errmsg = "codepoint not in range(0x110000)";
5089 startinpos = ((const char *)q)-starts;
5090 endinpos = startinpos+4;
5091 goto utf32Error;
5092 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005093 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5094 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 q += 4;
5096 continue;
5097 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005098 if (unicode_decode_call_errorhandler(
5099 errors, &errorHandler,
5100 "utf32", errmsg,
5101 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005102 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005103 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005104 }
5105
5106 if (byteorder)
5107 *byteorder = bo;
5108
5109 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111
5112 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005113 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005114 goto onError;
5115
5116 Py_XDECREF(errorHandler);
5117 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005118 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005119
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005121 Py_DECREF(unicode);
5122 Py_XDECREF(errorHandler);
5123 Py_XDECREF(exc);
5124 return NULL;
5125}
5126
5127PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005128_PyUnicode_EncodeUTF32(PyObject *str,
5129 const char *errors,
5130 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005131{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005132 int kind;
5133 void *data;
5134 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005135 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005136 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005137 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138 /* Offsets from p for storing byte pairs in the right order. */
5139#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5140 int iorder[] = {0, 1, 2, 3};
5141#else
5142 int iorder[] = {3, 2, 1, 0};
5143#endif
5144
Benjamin Peterson29060642009-01-31 22:14:21 +00005145#define STORECHAR(CH) \
5146 do { \
5147 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5148 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5149 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5150 p[iorder[0]] = (CH) & 0xff; \
5151 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005152 } while(0)
5153
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005154 if (!PyUnicode_Check(str)) {
5155 PyErr_BadArgument();
5156 return NULL;
5157 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005158 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005159 return NULL;
5160 kind = PyUnicode_KIND(str);
5161 data = PyUnicode_DATA(str);
5162 len = PyUnicode_GET_LENGTH(str);
5163
5164 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005165 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005167 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005168 if (v == NULL)
5169 return NULL;
5170
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005171 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005172 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005174 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005175 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005176
5177 if (byteorder == -1) {
5178 /* force LE */
5179 iorder[0] = 0;
5180 iorder[1] = 1;
5181 iorder[2] = 2;
5182 iorder[3] = 3;
5183 }
5184 else if (byteorder == 1) {
5185 /* force BE */
5186 iorder[0] = 3;
5187 iorder[1] = 2;
5188 iorder[2] = 1;
5189 iorder[3] = 0;
5190 }
5191
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005192 for (i = 0; i < len; i++)
5193 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005194
5195 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005196 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005197#undef STORECHAR
5198}
5199
Alexander Belopolsky40018472011-02-26 01:02:56 +00005200PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005201PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5202 Py_ssize_t size,
5203 const char *errors,
5204 int byteorder)
5205{
5206 PyObject *result;
5207 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5208 if (tmp == NULL)
5209 return NULL;
5210 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5211 Py_DECREF(tmp);
5212 return result;
5213}
5214
5215PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005216PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005217{
Victor Stinnerb960b342011-11-20 19:12:52 +01005218 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005219}
5220
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221/* --- UTF-16 Codec ------------------------------------------------------- */
5222
Tim Peters772747b2001-08-09 22:21:55 +00005223PyObject *
5224PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005225 Py_ssize_t size,
5226 const char *errors,
5227 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228{
Walter Dörwald69652032004-09-07 20:24:22 +00005229 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5230}
5231
5232PyObject *
5233PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005234 Py_ssize_t size,
5235 const char *errors,
5236 int *byteorder,
5237 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005238{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005239 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005240 Py_ssize_t startinpos;
5241 Py_ssize_t endinpos;
5242 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005243 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005244 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005245 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005246 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005247 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005248 PyObject *errorHandler = NULL;
5249 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250
Tim Peters772747b2001-08-09 22:21:55 +00005251 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005252 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253
5254 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005255 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005257 /* Check for BOM marks (U+FEFF) in the input and adjust current
5258 byte order setting accordingly. In native mode, the leading BOM
5259 mark is skipped, in all other modes, it is copied to the output
5260 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005261 if (bo == 0 && size >= 2) {
5262 const Py_UCS4 bom = (q[1] << 8) | q[0];
5263 if (bom == 0xFEFF) {
5264 q += 2;
5265 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005267 else if (bom == 0xFFFE) {
5268 q += 2;
5269 bo = 1;
5270 }
5271 if (byteorder)
5272 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005273 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274
Antoine Pitrou63065d72012-05-15 23:48:04 +02005275 if (q == e) {
5276 if (consumed)
5277 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005278 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005279 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005280
Antoine Pitrouab868312009-01-10 15:40:25 +00005281#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005282 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005283#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005284 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005285#endif
Tim Peters772747b2001-08-09 22:21:55 +00005286
Antoine Pitrou63065d72012-05-15 23:48:04 +02005287 /* Note: size will always be longer than the resulting Unicode
5288 character count */
5289 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5290 if (!unicode)
5291 return NULL;
5292
5293 outpos = 0;
5294 while (1) {
5295 Py_UCS4 ch = 0;
5296 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005297 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005298 if (kind == PyUnicode_1BYTE_KIND) {
5299 if (PyUnicode_IS_ASCII(unicode))
5300 ch = asciilib_utf16_decode(&q, e,
5301 PyUnicode_1BYTE_DATA(unicode), &outpos,
5302 native_ordering);
5303 else
5304 ch = ucs1lib_utf16_decode(&q, e,
5305 PyUnicode_1BYTE_DATA(unicode), &outpos,
5306 native_ordering);
5307 } else if (kind == PyUnicode_2BYTE_KIND) {
5308 ch = ucs2lib_utf16_decode(&q, e,
5309 PyUnicode_2BYTE_DATA(unicode), &outpos,
5310 native_ordering);
5311 } else {
5312 assert(kind == PyUnicode_4BYTE_KIND);
5313 ch = ucs4lib_utf16_decode(&q, e,
5314 PyUnicode_4BYTE_DATA(unicode), &outpos,
5315 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005316 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005317 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005318
Antoine Pitrou63065d72012-05-15 23:48:04 +02005319 switch (ch)
5320 {
5321 case 0:
5322 /* remaining byte at the end? (size should be even) */
5323 if (q == e || consumed)
5324 goto End;
5325 errmsg = "truncated data";
5326 startinpos = ((const char *)q) - starts;
5327 endinpos = ((const char *)e) - starts;
5328 break;
5329 /* The remaining input chars are ignored if the callback
5330 chooses to skip the input */
5331 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005332 q -= 2;
5333 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005334 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005335 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005336 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005337 endinpos = ((const char *)e) - starts;
5338 break;
5339 case 2:
5340 errmsg = "illegal encoding";
5341 startinpos = ((const char *)q) - 2 - starts;
5342 endinpos = startinpos + 2;
5343 break;
5344 case 3:
5345 errmsg = "illegal UTF-16 surrogate";
5346 startinpos = ((const char *)q) - 4 - starts;
5347 endinpos = startinpos + 2;
5348 break;
5349 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005350 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5351 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 continue;
5353 }
5354
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005356 errors,
5357 &errorHandler,
5358 "utf16", errmsg,
5359 &starts,
5360 (const char **)&e,
5361 &startinpos,
5362 &endinpos,
5363 &exc,
5364 (const char **)&q,
5365 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005366 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 }
5369
Antoine Pitrou63065d72012-05-15 23:48:04 +02005370End:
Walter Dörwald69652032004-09-07 20:24:22 +00005371 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005373
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005375 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 goto onError;
5377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005378 Py_XDECREF(errorHandler);
5379 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005380 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381
Benjamin Peterson29060642009-01-31 22:14:21 +00005382 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384 Py_XDECREF(errorHandler);
5385 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 return NULL;
5387}
5388
Tim Peters772747b2001-08-09 22:21:55 +00005389PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005390_PyUnicode_EncodeUTF16(PyObject *str,
5391 const char *errors,
5392 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005394 enum PyUnicode_Kind kind;
5395 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005396 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005397 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005398 unsigned short *out;
5399 Py_ssize_t bytesize;
5400 Py_ssize_t pairs;
5401#ifdef WORDS_BIGENDIAN
5402 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005403#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005404 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005405#endif
5406
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005407 if (!PyUnicode_Check(str)) {
5408 PyErr_BadArgument();
5409 return NULL;
5410 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005411 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005412 return NULL;
5413 kind = PyUnicode_KIND(str);
5414 data = PyUnicode_DATA(str);
5415 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005416
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005417 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005418 if (kind == PyUnicode_4BYTE_KIND) {
5419 const Py_UCS4 *in = (const Py_UCS4 *)data;
5420 const Py_UCS4 *end = in + len;
5421 while (in < end)
5422 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005423 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005424 }
5425 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005426 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005427 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005428 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 if (v == NULL)
5430 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005432 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005433 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005434 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005436 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005437 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005438 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005439
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005440 switch (kind) {
5441 case PyUnicode_1BYTE_KIND: {
5442 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5443 break;
Tim Peters772747b2001-08-09 22:21:55 +00005444 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005445 case PyUnicode_2BYTE_KIND: {
5446 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5447 break;
Tim Peters772747b2001-08-09 22:21:55 +00005448 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005449 case PyUnicode_4BYTE_KIND: {
5450 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5451 break;
5452 }
5453 default:
5454 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005455 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005456
5457 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005458 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459}
5460
Alexander Belopolsky40018472011-02-26 01:02:56 +00005461PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005462PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5463 Py_ssize_t size,
5464 const char *errors,
5465 int byteorder)
5466{
5467 PyObject *result;
5468 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5469 if (tmp == NULL)
5470 return NULL;
5471 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5472 Py_DECREF(tmp);
5473 return result;
5474}
5475
5476PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005477PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005479 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480}
5481
5482/* --- Unicode Escape Codec ----------------------------------------------- */
5483
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.
   Returns -1 if any escapes were found which cause the string to
   pop out of ASCII range.  Otherwise returns the length of the
   required buffer to hold the string.

   s:    input bytes; only the first `size` bytes are examined (no NUL
         terminator is required).
   size: number of bytes in `s`; a negative value yields -1.

   -1 is also returned for any byte > 127, for a backslash that is the last
   byte of the input, and for octal, \x, \u, \U and \N escapes.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before doing any pointer arithmetic with it: forming
       `s + size` for a negative size is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5538
/* Pointer to the character-name lookup C API.  Starts out NULL;
   PyUnicode_DecodeUnicodeEscape fills it in on demand through
   PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1) the first time it has to
   resolve a \N{name} escape. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005539static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005540
Alexander Belopolsky40018472011-02-26 01:02:56 +00005541PyObject *
5542PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005543 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005544 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005546 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005547 Py_ssize_t startinpos;
5548 Py_ssize_t endinpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005549 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005551 char* message;
5552 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005553 PyObject *errorHandler = NULL;
5554 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005555 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005556 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005557
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005558 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005559
5560 /* After length_of_escaped_ascii_string() there are two alternatives,
5561 either the string is pure ASCII with named escapes like \n, etc.
5562 and we determined it's exact size (common case)
5563 or it contains \x, \u, ... escape sequences. then we create a
5564 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005565 if (len >= 0) {
5566 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005567 if (!v)
5568 goto onError;
5569 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570 }
5571 else {
5572 /* Escaped strings will always be longer than the resulting
5573 Unicode string, so we start with size here and then reduce the
5574 length after conversion to the true value.
5575 (but if the error callback returns a long replacement string
5576 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005577 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005578 if (!v)
5579 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005580 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005581 }
5582
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005584 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005585 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005587
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 while (s < end) {
5589 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005590 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005593 /* The only case in which i == ascii_length is a backslash
5594 followed by a newline. */
5595 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005596
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 /* Non-escape characters are interpreted as Unicode ordinals */
5598 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005599 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5600 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 continue;
5602 }
5603
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005604 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 /* \ - Escapes */
5606 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005607 c = *s++;
5608 if (s > end)
5609 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005610
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005611 /* The only case in which i == ascii_length is a backslash
5612 followed by a newline. */
5613 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005614
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005615 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005618#define WRITECHAR(ch) \
5619 do { \
5620 if (unicode_putchar(&v, &i, ch) < 0) \
5621 goto onError; \
5622 }while(0)
5623
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005625 case '\\': WRITECHAR('\\'); break;
5626 case '\'': WRITECHAR('\''); break;
5627 case '\"': WRITECHAR('\"'); break;
5628 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005629 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005630 case 'f': WRITECHAR('\014'); break;
5631 case 't': WRITECHAR('\t'); break;
5632 case 'n': WRITECHAR('\n'); break;
5633 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005634 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005635 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005636 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005637 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638
Benjamin Peterson29060642009-01-31 22:14:21 +00005639 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 case '0': case '1': case '2': case '3':
5641 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005642 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005643 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005644 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005645 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005646 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005648 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 break;
5650
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 /* hex escapes */
5652 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005654 digits = 2;
5655 message = "truncated \\xXX escape";
5656 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005660 digits = 4;
5661 message = "truncated \\uXXXX escape";
5662 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005665 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005666 digits = 8;
5667 message = "truncated \\UXXXXXXXX escape";
5668 hexescape:
5669 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005670 if (end - s < digits) {
5671 /* count only hex digits */
5672 for (; s < end; ++s) {
5673 c = (unsigned char)*s;
5674 if (!Py_ISXDIGIT(c))
5675 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005676 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005677 goto error;
5678 }
5679 for (; digits--; ++s) {
5680 c = (unsigned char)*s;
5681 if (!Py_ISXDIGIT(c))
5682 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005683 chr = (chr<<4) & ~0xF;
5684 if (c >= '0' && c <= '9')
5685 chr += c - '0';
5686 else if (c >= 'a' && c <= 'f')
5687 chr += 10 + c - 'a';
5688 else
5689 chr += 10 + c - 'A';
5690 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005691 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692 /* _decoding_error will have already written into the
5693 target buffer. */
5694 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005695 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005696 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005697 message = "illegal Unicode character";
5698 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005699 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005700 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005701 break;
5702
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005704 case 'N':
5705 message = "malformed \\N character escape";
5706 if (ucnhash_CAPI == NULL) {
5707 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005708 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5709 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005710 if (ucnhash_CAPI == NULL)
5711 goto ucnhashError;
5712 }
5713 if (*s == '{') {
5714 const char *start = s+1;
5715 /* look for the closing brace */
5716 while (*s != '}' && s < end)
5717 s++;
5718 if (s > start && s < end && *s == '}') {
5719 /* found a name. look it up in the unicode database */
5720 message = "unknown Unicode character name";
5721 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005722 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005723 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005724 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005725 goto store;
5726 }
5727 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005728 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005729
5730 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005731 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 message = "\\ at end of string";
5733 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005734 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005735 }
5736 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005737 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005738 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005739 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005740 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005742 continue;
5743
5744 error:
5745 endinpos = s-starts;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005746 if (unicode_decode_call_errorhandler(
5747 errors, &errorHandler,
5748 "unicodeescape", message,
5749 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005750 &v, &i))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005751 goto onError;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005752 len = PyUnicode_GET_LENGTH(v);
Serhiy Storchakad6793772013-01-29 10:20:44 +02005753 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005755#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005756
Victor Stinner16e6a802011-12-12 13:24:15 +01005757 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005758 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005759 Py_XDECREF(errorHandler);
5760 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005761 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005762
Benjamin Peterson29060642009-01-31 22:14:21 +00005763 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005764 PyErr_SetString(
5765 PyExc_UnicodeError,
5766 "\\N escapes not supported (can't load unicodedata module)"
5767 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005768 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 Py_XDECREF(errorHandler);
5770 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005771 return NULL;
5772
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 Py_XDECREF(errorHandler);
5776 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 return NULL;
5778}
5779
5780/* Return a Unicode-Escape string version of the Unicode object.
5781
5782 If quotes is true, the string is enclosed in u"" or u'' quotes as
5783 appropriate.
5784
5785*/
5786
Alexander Belopolsky40018472011-02-26 01:02:56 +00005787PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005788PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005790 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005791 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005793 int kind;
5794 void *data;
5795 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796
Ezio Melottie7f90372012-10-05 03:33:31 +03005797 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005798 escape.
5799
Ezio Melottie7f90372012-10-05 03:33:31 +03005800 For UCS1 strings it's '\xxx', 4 bytes per source character.
5801 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5802 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005803 */
5804
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005805 if (!PyUnicode_Check(unicode)) {
5806 PyErr_BadArgument();
5807 return NULL;
5808 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005809 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005810 return NULL;
5811 len = PyUnicode_GET_LENGTH(unicode);
5812 kind = PyUnicode_KIND(unicode);
5813 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005814 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005815 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5816 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5817 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5818 }
5819
5820 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005821 return PyBytes_FromStringAndSize(NULL, 0);
5822
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005823 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005825
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005826 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005827 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005828 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 if (repr == NULL)
5831 return NULL;
5832
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005833 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005835 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005836 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005837
Walter Dörwald79e913e2007-05-12 11:08:06 +00005838 /* Escape backslashes */
5839 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 *p++ = '\\';
5841 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005842 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005843 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005844
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005845 /* Map 21-bit characters to '\U00xxxxxx' */
5846 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005847 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005848 *p++ = '\\';
5849 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005850 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5851 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5852 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5853 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5854 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5855 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5856 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5857 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005859 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005860
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005862 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 *p++ = '\\';
5864 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005865 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5866 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5867 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5868 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005870
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005871 /* Map special whitespace to '\t', \n', '\r' */
5872 else if (ch == '\t') {
5873 *p++ = '\\';
5874 *p++ = 't';
5875 }
5876 else if (ch == '\n') {
5877 *p++ = '\\';
5878 *p++ = 'n';
5879 }
5880 else if (ch == '\r') {
5881 *p++ = '\\';
5882 *p++ = 'r';
5883 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005884
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005885 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005886 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005888 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005889 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5890 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005891 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005892
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 /* Copy everything else as-is */
5894 else
5895 *p++ = (char) ch;
5896 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005898 assert(p - PyBytes_AS_STRING(repr) > 0);
5899 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5900 return NULL;
5901 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902}
5903
Alexander Belopolsky40018472011-02-26 01:02:56 +00005904PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005905PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5906 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005908 PyObject *result;
5909 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5910 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005912 result = PyUnicode_AsUnicodeEscapeString(tmp);
5913 Py_DECREF(tmp);
5914 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915}
5916
5917/* --- Raw Unicode Escape Codec ------------------------------------------- */
5918
Alexander Belopolsky40018472011-02-26 01:02:56 +00005919PyObject *
5920PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005921 Py_ssize_t size,
5922 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005924 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005925 Py_ssize_t startinpos;
5926 Py_ssize_t endinpos;
5927 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005928 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 const char *end;
5930 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 PyObject *errorHandler = NULL;
5932 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005933
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 /* Escaped strings will always be longer than the resulting
5935 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005936 length after conversion to the true value. (But decoding error
5937 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005938 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005942 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005943 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 end = s + size;
5945 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 unsigned char c;
5947 Py_UCS4 x;
5948 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005949 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 /* Non-escape characters are interpreted as Unicode ordinals */
5952 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005953 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5954 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005956 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 startinpos = s-starts;
5958
5959 /* \u-escapes are only interpreted iff the number of leading
5960 backslashes if odd */
5961 bs = s;
5962 for (;s < end;) {
5963 if (*s != '\\')
5964 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005965 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5966 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005967 }
5968 if (((s - bs) & 1) == 0 ||
5969 s >= end ||
5970 (*s != 'u' && *s != 'U')) {
5971 continue;
5972 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005973 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 count = *s=='u' ? 4 : 8;
5975 s++;
5976
5977 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 for (x = 0, i = 0; i < count; ++i, ++s) {
5979 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005980 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 endinpos = s-starts;
5982 if (unicode_decode_call_errorhandler(
5983 errors, &errorHandler,
5984 "rawunicodeescape", "truncated \\uXXXX",
5985 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005986 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 goto onError;
5988 goto nextByte;
5989 }
5990 x = (x<<4) & ~0xF;
5991 if (c >= '0' && c <= '9')
5992 x += c - '0';
5993 else if (c >= 'a' && c <= 'f')
5994 x += 10 + c - 'a';
5995 else
5996 x += 10 + c - 'A';
5997 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005998 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005999 if (unicode_putchar(&v, &outpos, x) < 0)
6000 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006001 } else {
6002 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006003 if (unicode_decode_call_errorhandler(
6004 errors, &errorHandler,
6005 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006007 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006009 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 nextByte:
6011 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006013 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006015 Py_XDECREF(errorHandler);
6016 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006017 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006018
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006021 Py_XDECREF(errorHandler);
6022 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 return NULL;
6024}
6025
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006026
Alexander Belopolsky40018472011-02-26 01:02:56 +00006027PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006028PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006030 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 char *p;
6032 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006033 Py_ssize_t expandsize, pos;
6034 int kind;
6035 void *data;
6036 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006038 if (!PyUnicode_Check(unicode)) {
6039 PyErr_BadArgument();
6040 return NULL;
6041 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006042 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006043 return NULL;
6044 kind = PyUnicode_KIND(unicode);
6045 data = PyUnicode_DATA(unicode);
6046 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006047 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6048 bytes, and 1 byte characters 4. */
6049 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006050
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006051 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006053
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006054 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 if (repr == NULL)
6056 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006057 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006058 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006060 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006061 for (pos = 0; pos < len; pos++) {
6062 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 /* Map 32-bit characters to '\Uxxxxxxxx' */
6064 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006065 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006066 *p++ = '\\';
6067 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006068 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6069 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6070 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6071 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6072 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6073 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6074 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6075 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006076 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006078 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 *p++ = '\\';
6080 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006081 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6082 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6083 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6084 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 /* Copy everything else as-is */
6087 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 *p++ = (char) ch;
6089 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006090
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006091 assert(p > q);
6092 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006093 return NULL;
6094 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095}
6096
Alexander Belopolsky40018472011-02-26 01:02:56 +00006097PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006098PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6099 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006101 PyObject *result;
6102 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6103 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006104 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006105 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6106 Py_DECREF(tmp);
6107 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108}
6109
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006110/* --- Unicode Internal Codec ------------------------------------------- */
6111
Alexander Belopolsky40018472011-02-26 01:02:56 +00006112PyObject *
6113_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006114 Py_ssize_t size,
6115 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006116{
6117 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006118 Py_ssize_t startinpos;
6119 Py_ssize_t endinpos;
6120 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006121 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006122 const char *end;
6123 const char *reason;
6124 PyObject *errorHandler = NULL;
6125 PyObject *exc = NULL;
6126
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006127 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006128 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006129 1))
6130 return NULL;
6131
Thomas Wouters89f507f2006-12-13 04:49:30 +00006132 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006133 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006134 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006136 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006137 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006138 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006139 end = s + size;
6140
6141 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006142 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006143 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006144 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006145 endinpos = end-starts;
6146 reason = "truncated input";
6147 goto error;
6148 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006149 /* We copy the raw representation one byte at a time because the
6150 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006151 ((char *) &uch)[0] = s[0];
6152 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006153#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006154 ((char *) &uch)[2] = s[2];
6155 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006156#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006157 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006158#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006159 /* We have to sanity check the raw data, otherwise doom looms for
6160 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006161 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006162 endinpos = s - starts + Py_UNICODE_SIZE;
6163 reason = "illegal code point (> 0x10FFFF)";
6164 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006165 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006166#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006167 s += Py_UNICODE_SIZE;
6168#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006169 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006170 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006171 Py_UNICODE uch2;
6172 ((char *) &uch2)[0] = s[0];
6173 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006174 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006175 {
Victor Stinner551ac952011-11-29 22:58:13 +01006176 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006177 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006178 }
6179 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006180#endif
6181
6182 if (unicode_putchar(&v, &outpos, ch) < 0)
6183 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006184 continue;
6185
6186 error:
6187 startinpos = s - starts;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006188 if (unicode_decode_call_errorhandler(
6189 errors, &errorHandler,
6190 "unicode_internal", reason,
6191 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006192 &v, &outpos))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006193 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006194 }
6195
Victor Stinner16e6a802011-12-12 13:24:15 +01006196 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006197 goto onError;
6198 Py_XDECREF(errorHandler);
6199 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006200 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006201
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006203 Py_XDECREF(v);
6204 Py_XDECREF(errorHandler);
6205 Py_XDECREF(exc);
6206 return NULL;
6207}
6208
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209/* --- Latin-1 Codec ------------------------------------------------------ */
6210
Alexander Belopolsky40018472011-02-26 01:02:56 +00006211PyObject *
6212PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006213 Py_ssize_t size,
6214 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006217 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218}
6219
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006220/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006221static void
6222make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006223 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006224 PyObject *unicode,
6225 Py_ssize_t startpos, Py_ssize_t endpos,
6226 const char *reason)
6227{
6228 if (*exceptionObject == NULL) {
6229 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006230 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006231 encoding, unicode, startpos, endpos, reason);
6232 }
6233 else {
6234 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6235 goto onError;
6236 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6237 goto onError;
6238 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6239 goto onError;
6240 return;
6241 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006242 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006243 }
6244}
6245
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006246/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006247static void
6248raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006249 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006250 PyObject *unicode,
6251 Py_ssize_t startpos, Py_ssize_t endpos,
6252 const char *reason)
6253{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006254 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006255 encoding, unicode, startpos, endpos, reason);
6256 if (*exceptionObject != NULL)
6257 PyCodec_StrictErrors(*exceptionObject);
6258}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006259
6260/* error handling callback helper:
6261 build arguments, call the callback and check the arguments,
6262 put the result into newpos and return the replacement string, which
6263 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006264static PyObject *
6265unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006266 PyObject **errorHandler,
6267 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006268 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006269 Py_ssize_t startpos, Py_ssize_t endpos,
6270 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006271{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006272 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006273 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006274 PyObject *restuple;
6275 PyObject *resunicode;
6276
6277 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281 }
6282
Benjamin Petersonbac79492012-01-14 13:34:47 -05006283 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006284 return NULL;
6285 len = PyUnicode_GET_LENGTH(unicode);
6286
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006287 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006288 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291
6292 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006294 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006297 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 Py_DECREF(restuple);
6299 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006301 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 &resunicode, newpos)) {
6303 Py_DECREF(restuple);
6304 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006305 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006306 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6307 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6308 Py_DECREF(restuple);
6309 return NULL;
6310 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006311 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006312 *newpos = len + *newpos;
6313 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6315 Py_DECREF(restuple);
6316 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006317 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006318 Py_INCREF(resunicode);
6319 Py_DECREF(restuple);
6320 return resunicode;
6321}
6322
Alexander Belopolsky40018472011-02-26 01:02:56 +00006323static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006324unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006325 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006326 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006327{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006328 /* input state */
6329 Py_ssize_t pos=0, size;
6330 int kind;
6331 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006332 /* output object */
6333 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006334 /* pointer into the output */
6335 char *str;
6336 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006337 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006338 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6339 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006340 PyObject *errorHandler = NULL;
6341 PyObject *exc = NULL;
6342 /* the following variable is used for caching string comparisons
6343 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6344 int known_errorHandler = -1;
6345
Benjamin Petersonbac79492012-01-14 13:34:47 -05006346 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006347 return NULL;
6348 size = PyUnicode_GET_LENGTH(unicode);
6349 kind = PyUnicode_KIND(unicode);
6350 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006351 /* allocate enough for a simple encoding without
6352 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006353 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006354 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006355 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006357 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006358 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359 ressize = size;
6360
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006361 while (pos < size) {
6362 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 /* can we encode this? */
6365 if (c<limit) {
6366 /* no overflow check, because we know that the space is enough */
6367 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006368 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006369 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 Py_ssize_t requiredsize;
6372 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006373 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006375 Py_ssize_t collstart = pos;
6376 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006378 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 ++collend;
6380 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6381 if (known_errorHandler==-1) {
6382 if ((errors==NULL) || (!strcmp(errors, "strict")))
6383 known_errorHandler = 1;
6384 else if (!strcmp(errors, "replace"))
6385 known_errorHandler = 2;
6386 else if (!strcmp(errors, "ignore"))
6387 known_errorHandler = 3;
6388 else if (!strcmp(errors, "xmlcharrefreplace"))
6389 known_errorHandler = 4;
6390 else
6391 known_errorHandler = 0;
6392 }
6393 switch (known_errorHandler) {
6394 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006395 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 goto onError;
6397 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006398 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 *str++ = '?'; /* fall through */
6400 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006401 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 break;
6403 case 4: /* xmlcharrefreplace */
6404 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006405 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006406 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006407 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006408 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006409 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006410 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006411 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006412 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006413 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006414 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006415 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006416 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006417 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006418 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006419 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006421 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006422 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006423 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006424 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006425 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006426 if (requiredsize > PY_SSIZE_T_MAX - incr)
6427 goto overflow;
6428 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006430 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6431 goto overflow;
6432 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006434 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 requiredsize = 2*ressize;
6436 if (_PyBytes_Resize(&res, requiredsize))
6437 goto onError;
6438 str = PyBytes_AS_STRING(res) + respos;
6439 ressize = requiredsize;
6440 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006441 /* generate replacement */
6442 for (i = collstart; i < collend; ++i) {
6443 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006445 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 break;
6447 default:
6448 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006449 encoding, reason, unicode, &exc,
6450 collstart, collend, &newpos);
6451 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006452 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006454 if (PyBytes_Check(repunicode)) {
6455 /* Directly copy bytes result to output. */
6456 repsize = PyBytes_Size(repunicode);
6457 if (repsize > 1) {
6458 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006459 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006460 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6461 Py_DECREF(repunicode);
6462 goto overflow;
6463 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006464 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6465 Py_DECREF(repunicode);
6466 goto onError;
6467 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006468 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006469 ressize += repsize-1;
6470 }
6471 memcpy(str, PyBytes_AsString(repunicode), repsize);
6472 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006473 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006474 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006475 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006476 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 /* need more space? (at least enough for what we
6478 have+the replacement+the rest of the string, so
6479 we won't have to check space for encodable characters) */
6480 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006481 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006482 requiredsize = respos;
6483 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6484 goto overflow;
6485 requiredsize += repsize;
6486 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6487 goto overflow;
6488 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006490 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006491 requiredsize = 2*ressize;
6492 if (_PyBytes_Resize(&res, requiredsize)) {
6493 Py_DECREF(repunicode);
6494 goto onError;
6495 }
6496 str = PyBytes_AS_STRING(res) + respos;
6497 ressize = requiredsize;
6498 }
6499 /* check if there is anything unencodable in the replacement
6500 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501 for (i = 0; repsize-->0; ++i, ++str) {
6502 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006504 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006505 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 Py_DECREF(repunicode);
6507 goto onError;
6508 }
6509 *str = (char)c;
6510 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006511 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006512 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006513 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006514 }
6515 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006516 /* Resize if we allocated to much */
6517 size = str - PyBytes_AS_STRING(res);
6518 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006519 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006520 if (_PyBytes_Resize(&res, size) < 0)
6521 goto onError;
6522 }
6523
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006524 Py_XDECREF(errorHandler);
6525 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006526 return res;
6527
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006528 overflow:
6529 PyErr_SetString(PyExc_OverflowError,
6530 "encoded result is too long for a Python string");
6531
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006532 onError:
6533 Py_XDECREF(res);
6534 Py_XDECREF(errorHandler);
6535 Py_XDECREF(exc);
6536 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006537}
6538
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006539/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006540PyObject *
6541PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006542 Py_ssize_t size,
6543 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006545 PyObject *result;
6546 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6547 if (unicode == NULL)
6548 return NULL;
6549 result = unicode_encode_ucs1(unicode, errors, 256);
6550 Py_DECREF(unicode);
6551 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552}
6553
Alexander Belopolsky40018472011-02-26 01:02:56 +00006554PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006555_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556{
6557 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 PyErr_BadArgument();
6559 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006561 if (PyUnicode_READY(unicode) == -1)
6562 return NULL;
6563 /* Fast path: if it is a one-byte string, construct
6564 bytes object directly. */
6565 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6566 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6567 PyUnicode_GET_LENGTH(unicode));
6568 /* Non-Latin-1 characters present. Defer to above function to
6569 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006570 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006571}
6572
6573PyObject*
6574PyUnicode_AsLatin1String(PyObject *unicode)
6575{
6576 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577}
6578
6579/* --- 7-bit ASCII Codec -------------------------------------------------- */
6580
Alexander Belopolsky40018472011-02-26 01:02:56 +00006581PyObject *
6582PyUnicode_DecodeASCII(const char *s,
6583 Py_ssize_t size,
6584 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006586 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006587 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006588 int kind;
6589 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006590 Py_ssize_t startinpos;
6591 Py_ssize_t endinpos;
6592 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006593 const char *e;
6594 PyObject *errorHandler = NULL;
6595 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006596
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006598 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006599
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006601 if (size == 1 && (unsigned char)s[0] < 128)
6602 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006603
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006604 unicode = PyUnicode_New(size, 127);
6605 if (unicode == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006607
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006609 data = PyUnicode_1BYTE_DATA(unicode);
6610 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6611 if (outpos == size)
6612 return unicode;
6613
6614 s += outpos;
6615 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006616 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 register unsigned char c = (unsigned char)*s;
6618 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006619 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 ++s;
6621 }
6622 else {
6623 startinpos = s-starts;
6624 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 if (unicode_decode_call_errorhandler(
6626 errors, &errorHandler,
6627 "ascii", "ordinal not in range(128)",
6628 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006629 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006631 kind = PyUnicode_KIND(unicode);
6632 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006635 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006636 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006637 Py_XDECREF(errorHandler);
6638 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006639 assert(_PyUnicode_CheckConsistency(unicode, 1));
6640 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006641
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006643 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 Py_XDECREF(errorHandler);
6645 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 return NULL;
6647}
6648
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006650PyObject *
6651PyUnicode_EncodeASCII(const Py_UNICODE *p,
6652 Py_ssize_t size,
6653 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006655 PyObject *result;
6656 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6657 if (unicode == NULL)
6658 return NULL;
6659 result = unicode_encode_ucs1(unicode, errors, 128);
6660 Py_DECREF(unicode);
6661 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662}
6663
Alexander Belopolsky40018472011-02-26 01:02:56 +00006664PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006665_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666{
6667 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 PyErr_BadArgument();
6669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006671 if (PyUnicode_READY(unicode) == -1)
6672 return NULL;
6673 /* Fast path: if it is an ASCII-only string, construct bytes object
6674 directly. Else defer to above function to raise the exception. */
6675 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6676 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6677 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006679}
6680
6681PyObject *
6682PyUnicode_AsASCIIString(PyObject *unicode)
6683{
6684 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685}
6686
Victor Stinner99b95382011-07-04 14:23:54 +02006687#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006688
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006689/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006690
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006691#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006692#define NEED_RETRY
6693#endif
6694
Victor Stinner3a50e702011-10-18 21:21:00 +02006695#ifndef WC_ERR_INVALID_CHARS
6696# define WC_ERR_INVALID_CHARS 0x0080
6697#endif
6698
6699static char*
6700code_page_name(UINT code_page, PyObject **obj)
6701{
6702 *obj = NULL;
6703 if (code_page == CP_ACP)
6704 return "mbcs";
6705 if (code_page == CP_UTF7)
6706 return "CP_UTF7";
6707 if (code_page == CP_UTF8)
6708 return "CP_UTF8";
6709
6710 *obj = PyBytes_FromFormat("cp%u", code_page);
6711 if (*obj == NULL)
6712 return NULL;
6713 return PyBytes_AS_STRING(*obj);
6714}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006715
Alexander Belopolsky40018472011-02-26 01:02:56 +00006716static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006717is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006718{
6719 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006720 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006721
Victor Stinner3a50e702011-10-18 21:21:00 +02006722 if (!IsDBCSLeadByteEx(code_page, *curr))
6723 return 0;
6724
6725 prev = CharPrevExA(code_page, s, curr, 0);
6726 if (prev == curr)
6727 return 1;
6728 /* FIXME: This code is limited to "true" double-byte encodings,
6729 as it assumes an incomplete character consists of a single
6730 byte. */
6731 if (curr - prev == 2)
6732 return 1;
6733 if (!IsDBCSLeadByteEx(code_page, *prev))
6734 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006735 return 0;
6736}
6737
Victor Stinner3a50e702011-10-18 21:21:00 +02006738static DWORD
6739decode_code_page_flags(UINT code_page)
6740{
6741 if (code_page == CP_UTF7) {
6742 /* The CP_UTF7 decoder only supports flags=0 */
6743 return 0;
6744 }
6745 else
6746 return MB_ERR_INVALID_CHARS;
6747}
6748
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006749/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006750 * Decode a byte string from a Windows code page into unicode object in strict
6751 * mode.
6752 *
6753 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6754 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006755 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006756static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006757decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006758 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006759 const char *in,
6760 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006761{
Victor Stinner3a50e702011-10-18 21:21:00 +02006762 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006763 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006764 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006765
6766 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006767 assert(insize > 0);
6768 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6769 if (outsize <= 0)
6770 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006771
6772 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006774 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006775 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 if (*v == NULL)
6777 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006778 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006779 }
6780 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006782 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006783 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006785 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006786 }
6787
6788 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006789 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6790 if (outsize <= 0)
6791 goto error;
6792 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006793
Victor Stinner3a50e702011-10-18 21:21:00 +02006794error:
6795 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6796 return -2;
6797 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006798 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006799}
6800
Victor Stinner3a50e702011-10-18 21:21:00 +02006801/*
6802 * Decode a byte string from a code page into unicode object with an error
6803 * handler.
6804 *
6805 * Returns consumed size if succeed, or raise a WindowsError or
6806 * UnicodeDecodeError exception and returns -1 on error.
6807 */
6808static int
6809decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006810 PyObject **v,
6811 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006812 const char *errors)
6813{
6814 const char *startin = in;
6815 const char *endin = in + size;
6816 const DWORD flags = decode_code_page_flags(code_page);
6817 /* Ideally, we should get reason from FormatMessage. This is the Windows
6818 2000 English version of the message. */
6819 const char *reason = "No mapping for the Unicode character exists "
6820 "in the target code page.";
6821 /* each step cannot decode more than 1 character, but a character can be
6822 represented as a surrogate pair */
6823 wchar_t buffer[2], *startout, *out;
6824 int insize, outsize;
6825 PyObject *errorHandler = NULL;
6826 PyObject *exc = NULL;
6827 PyObject *encoding_obj = NULL;
6828 char *encoding;
6829 DWORD err;
6830 int ret = -1;
6831
6832 assert(size > 0);
6833
6834 encoding = code_page_name(code_page, &encoding_obj);
6835 if (encoding == NULL)
6836 return -1;
6837
6838 if (errors == NULL || strcmp(errors, "strict") == 0) {
6839 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6840 UnicodeDecodeError. */
6841 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6842 if (exc != NULL) {
6843 PyCodec_StrictErrors(exc);
6844 Py_CLEAR(exc);
6845 }
6846 goto error;
6847 }
6848
6849 if (*v == NULL) {
6850 /* Create unicode object */
6851 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6852 PyErr_NoMemory();
6853 goto error;
6854 }
Victor Stinnerab595942011-12-17 04:59:06 +01006855 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006856 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006857 if (*v == NULL)
6858 goto error;
6859 startout = PyUnicode_AS_UNICODE(*v);
6860 }
6861 else {
6862 /* Extend unicode object */
6863 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6864 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6865 PyErr_NoMemory();
6866 goto error;
6867 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006868 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006869 goto error;
6870 startout = PyUnicode_AS_UNICODE(*v) + n;
6871 }
6872
6873 /* Decode the byte string character per character */
6874 out = startout;
6875 while (in < endin)
6876 {
6877 /* Decode a character */
6878 insize = 1;
6879 do
6880 {
6881 outsize = MultiByteToWideChar(code_page, flags,
6882 in, insize,
6883 buffer, Py_ARRAY_LENGTH(buffer));
6884 if (outsize > 0)
6885 break;
6886 err = GetLastError();
6887 if (err != ERROR_NO_UNICODE_TRANSLATION
6888 && err != ERROR_INSUFFICIENT_BUFFER)
6889 {
6890 PyErr_SetFromWindowsErr(0);
6891 goto error;
6892 }
6893 insize++;
6894 }
6895 /* 4=maximum length of a UTF-8 sequence */
6896 while (insize <= 4 && (in + insize) <= endin);
6897
6898 if (outsize <= 0) {
6899 Py_ssize_t startinpos, endinpos, outpos;
6900
6901 startinpos = in - startin;
6902 endinpos = startinpos + 1;
6903 outpos = out - PyUnicode_AS_UNICODE(*v);
6904 if (unicode_decode_call_errorhandler(
6905 errors, &errorHandler,
6906 encoding, reason,
6907 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006908 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006909 {
6910 goto error;
6911 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006912 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006913 }
6914 else {
6915 in += insize;
6916 memcpy(out, buffer, outsize * sizeof(wchar_t));
6917 out += outsize;
6918 }
6919 }
6920
6921 /* write a NUL character at the end */
6922 *out = 0;
6923
6924 /* Extend unicode object */
6925 outsize = out - startout;
6926 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006927 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006928 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006929 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006930
6931error:
6932 Py_XDECREF(encoding_obj);
6933 Py_XDECREF(errorHandler);
6934 Py_XDECREF(exc);
6935 return ret;
6936}
6937
Victor Stinner3a50e702011-10-18 21:21:00 +02006938static PyObject *
6939decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006940 const char *s, Py_ssize_t size,
6941 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006942{
Victor Stinner76a31a62011-11-04 00:05:13 +01006943 PyObject *v = NULL;
6944 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006945
Victor Stinner3a50e702011-10-18 21:21:00 +02006946 if (code_page < 0) {
6947 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6948 return NULL;
6949 }
6950
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006951 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006953
Victor Stinner76a31a62011-11-04 00:05:13 +01006954 do
6955 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006956#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006957 if (size > INT_MAX) {
6958 chunk_size = INT_MAX;
6959 final = 0;
6960 done = 0;
6961 }
6962 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006963#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006964 {
6965 chunk_size = (int)size;
6966 final = (consumed == NULL);
6967 done = 1;
6968 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006969
Victor Stinner76a31a62011-11-04 00:05:13 +01006970 /* Skip trailing lead-byte unless 'final' is set */
6971 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6972 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006973
Victor Stinner76a31a62011-11-04 00:05:13 +01006974 if (chunk_size == 0 && done) {
6975 if (v != NULL)
6976 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006977 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006978 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006979
Victor Stinner76a31a62011-11-04 00:05:13 +01006980
6981 converted = decode_code_page_strict(code_page, &v,
6982 s, chunk_size);
6983 if (converted == -2)
6984 converted = decode_code_page_errors(code_page, &v,
6985 s, chunk_size,
6986 errors);
6987 assert(converted != 0);
6988
6989 if (converted < 0) {
6990 Py_XDECREF(v);
6991 return NULL;
6992 }
6993
6994 if (consumed)
6995 *consumed += converted;
6996
6997 s += converted;
6998 size -= converted;
6999 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007000
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007001 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007002}
7003
Alexander Belopolsky40018472011-02-26 01:02:56 +00007004PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007005PyUnicode_DecodeCodePageStateful(int code_page,
7006 const char *s,
7007 Py_ssize_t size,
7008 const char *errors,
7009 Py_ssize_t *consumed)
7010{
7011 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7012}
7013
7014PyObject *
7015PyUnicode_DecodeMBCSStateful(const char *s,
7016 Py_ssize_t size,
7017 const char *errors,
7018 Py_ssize_t *consumed)
7019{
7020 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7021}
7022
7023PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007024PyUnicode_DecodeMBCS(const char *s,
7025 Py_ssize_t size,
7026 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007027{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007028 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7029}
7030
Victor Stinner3a50e702011-10-18 21:21:00 +02007031static DWORD
7032encode_code_page_flags(UINT code_page, const char *errors)
7033{
7034 if (code_page == CP_UTF8) {
7035 if (winver.dwMajorVersion >= 6)
7036 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7037 and later */
7038 return WC_ERR_INVALID_CHARS;
7039 else
7040 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7041 return 0;
7042 }
7043 else if (code_page == CP_UTF7) {
7044 /* CP_UTF7 only supports flags=0 */
7045 return 0;
7046 }
7047 else {
7048 if (errors != NULL && strcmp(errors, "replace") == 0)
7049 return 0;
7050 else
7051 return WC_NO_BEST_FIT_CHARS;
7052 }
7053}
7054
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007055/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007056 * Encode a Unicode string to a Windows code page into a byte string in strict
7057 * mode.
7058 *
7059 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7060 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007061 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007062static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007063encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007064 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066{
Victor Stinner554f3f02010-06-16 23:33:54 +00007067 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007068 BOOL *pusedDefaultChar = &usedDefaultChar;
7069 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007070 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007071 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007072 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007073 const DWORD flags = encode_code_page_flags(code_page, NULL);
7074 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007075 /* Create a substring so that we can get the UTF-16 representation
7076 of just the slice under consideration. */
7077 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007078
Martin v. Löwis3d325192011-11-04 18:23:06 +01007079 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007080
Victor Stinner3a50e702011-10-18 21:21:00 +02007081 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007082 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007083 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007084 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007085
Victor Stinner2fc507f2011-11-04 20:06:39 +01007086 substring = PyUnicode_Substring(unicode, offset, offset+len);
7087 if (substring == NULL)
7088 return -1;
7089 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7090 if (p == NULL) {
7091 Py_DECREF(substring);
7092 return -1;
7093 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007094
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007095 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007096 outsize = WideCharToMultiByte(code_page, flags,
7097 p, size,
7098 NULL, 0,
7099 NULL, pusedDefaultChar);
7100 if (outsize <= 0)
7101 goto error;
7102 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007103 if (pusedDefaultChar && *pusedDefaultChar) {
7104 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007105 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007106 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007107
Victor Stinner3a50e702011-10-18 21:21:00 +02007108 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007109 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007110 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007111 if (*outbytes == NULL) {
7112 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007114 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007116 }
7117 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007118 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 const Py_ssize_t n = PyBytes_Size(*outbytes);
7120 if (outsize > PY_SSIZE_T_MAX - n) {
7121 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007122 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007125 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7126 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007127 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007128 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007130 }
7131
7132 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007133 outsize = WideCharToMultiByte(code_page, flags,
7134 p, size,
7135 out, outsize,
7136 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007137 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 if (outsize <= 0)
7139 goto error;
7140 if (pusedDefaultChar && *pusedDefaultChar)
7141 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007142 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007143
Victor Stinner3a50e702011-10-18 21:21:00 +02007144error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007145 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7147 return -2;
7148 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007149 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007150}
7151
Victor Stinner3a50e702011-10-18 21:21:00 +02007152/*
7153 * Encode a Unicode string to a Windows code page into a byte string using a
7154 * error handler.
7155 *
7156 * Returns consumed characters if succeed, or raise a WindowsError and returns
7157 * -1 on other error.
7158 */
7159static int
7160encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007161 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007162 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007163{
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007165 Py_ssize_t pos = unicode_offset;
7166 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 /* Ideally, we should get reason from FormatMessage. This is the Windows
7168 2000 English version of the message. */
7169 const char *reason = "invalid character";
7170 /* 4=maximum length of a UTF-8 sequence */
7171 char buffer[4];
7172 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7173 Py_ssize_t outsize;
7174 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007175 PyObject *errorHandler = NULL;
7176 PyObject *exc = NULL;
7177 PyObject *encoding_obj = NULL;
7178 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007179 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 PyObject *rep;
7181 int ret = -1;
7182
7183 assert(insize > 0);
7184
7185 encoding = code_page_name(code_page, &encoding_obj);
7186 if (encoding == NULL)
7187 return -1;
7188
7189 if (errors == NULL || strcmp(errors, "strict") == 0) {
7190 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7191 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007192 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 if (exc != NULL) {
7194 PyCodec_StrictErrors(exc);
7195 Py_DECREF(exc);
7196 }
7197 Py_XDECREF(encoding_obj);
7198 return -1;
7199 }
7200
7201 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7202 pusedDefaultChar = &usedDefaultChar;
7203 else
7204 pusedDefaultChar = NULL;
7205
7206 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7207 PyErr_NoMemory();
7208 goto error;
7209 }
7210 outsize = insize * Py_ARRAY_LENGTH(buffer);
7211
7212 if (*outbytes == NULL) {
7213 /* Create string object */
7214 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7215 if (*outbytes == NULL)
7216 goto error;
7217 out = PyBytes_AS_STRING(*outbytes);
7218 }
7219 else {
7220 /* Extend string object */
7221 Py_ssize_t n = PyBytes_Size(*outbytes);
7222 if (n > PY_SSIZE_T_MAX - outsize) {
7223 PyErr_NoMemory();
7224 goto error;
7225 }
7226 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7227 goto error;
7228 out = PyBytes_AS_STRING(*outbytes) + n;
7229 }
7230
7231 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007232 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007233 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007234 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7235 wchar_t chars[2];
7236 int charsize;
7237 if (ch < 0x10000) {
7238 chars[0] = (wchar_t)ch;
7239 charsize = 1;
7240 }
7241 else {
7242 ch -= 0x10000;
7243 chars[0] = 0xd800 + (ch >> 10);
7244 chars[1] = 0xdc00 + (ch & 0x3ff);
7245 charsize = 2;
7246 }
7247
Victor Stinner3a50e702011-10-18 21:21:00 +02007248 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007249 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 buffer, Py_ARRAY_LENGTH(buffer),
7251 NULL, pusedDefaultChar);
7252 if (outsize > 0) {
7253 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7254 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007255 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007256 memcpy(out, buffer, outsize);
7257 out += outsize;
7258 continue;
7259 }
7260 }
7261 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7262 PyErr_SetFromWindowsErr(0);
7263 goto error;
7264 }
7265
Victor Stinner3a50e702011-10-18 21:21:00 +02007266 rep = unicode_encode_call_errorhandler(
7267 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007268 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007269 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 if (rep == NULL)
7271 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007272 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007273
7274 if (PyBytes_Check(rep)) {
7275 outsize = PyBytes_GET_SIZE(rep);
7276 if (outsize != 1) {
7277 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7278 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7279 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7280 Py_DECREF(rep);
7281 goto error;
7282 }
7283 out = PyBytes_AS_STRING(*outbytes) + offset;
7284 }
7285 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7286 out += outsize;
7287 }
7288 else {
7289 Py_ssize_t i;
7290 enum PyUnicode_Kind kind;
7291 void *data;
7292
Benjamin Petersonbac79492012-01-14 13:34:47 -05007293 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 Py_DECREF(rep);
7295 goto error;
7296 }
7297
7298 outsize = PyUnicode_GET_LENGTH(rep);
7299 if (outsize != 1) {
7300 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7301 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7302 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7303 Py_DECREF(rep);
7304 goto error;
7305 }
7306 out = PyBytes_AS_STRING(*outbytes) + offset;
7307 }
7308 kind = PyUnicode_KIND(rep);
7309 data = PyUnicode_DATA(rep);
7310 for (i=0; i < outsize; i++) {
7311 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7312 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007313 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007314 encoding, unicode,
7315 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 "unable to encode error handler result to ASCII");
7317 Py_DECREF(rep);
7318 goto error;
7319 }
7320 *out = (unsigned char)ch;
7321 out++;
7322 }
7323 }
7324 Py_DECREF(rep);
7325 }
7326 /* write a NUL byte */
7327 *out = 0;
7328 outsize = out - PyBytes_AS_STRING(*outbytes);
7329 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7330 if (_PyBytes_Resize(outbytes, outsize) < 0)
7331 goto error;
7332 ret = 0;
7333
7334error:
7335 Py_XDECREF(encoding_obj);
7336 Py_XDECREF(errorHandler);
7337 Py_XDECREF(exc);
7338 return ret;
7339}
7340
Victor Stinner3a50e702011-10-18 21:21:00 +02007341static PyObject *
7342encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007343 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007344 const char *errors)
7345{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007346 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007347 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007348 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007349 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007350
Benjamin Petersonbac79492012-01-14 13:34:47 -05007351 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007352 return NULL;
7353 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007354
Victor Stinner3a50e702011-10-18 21:21:00 +02007355 if (code_page < 0) {
7356 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7357 return NULL;
7358 }
7359
Martin v. Löwis3d325192011-11-04 18:23:06 +01007360 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007361 return PyBytes_FromStringAndSize(NULL, 0);
7362
Victor Stinner7581cef2011-11-03 22:32:33 +01007363 offset = 0;
7364 do
7365 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007366#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007367 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007368 chunks. */
7369 if (len > INT_MAX/2) {
7370 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007371 done = 0;
7372 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007373 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007374#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007375 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007376 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007377 done = 1;
7378 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007379
Victor Stinner76a31a62011-11-04 00:05:13 +01007380 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007381 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007382 errors);
7383 if (ret == -2)
7384 ret = encode_code_page_errors(code_page, &outbytes,
7385 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007386 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007387 if (ret < 0) {
7388 Py_XDECREF(outbytes);
7389 return NULL;
7390 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007391
Victor Stinner7581cef2011-11-03 22:32:33 +01007392 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007393 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007394 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007395
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 return outbytes;
7397}
7398
7399PyObject *
7400PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7401 Py_ssize_t size,
7402 const char *errors)
7403{
Victor Stinner7581cef2011-11-03 22:32:33 +01007404 PyObject *unicode, *res;
7405 unicode = PyUnicode_FromUnicode(p, size);
7406 if (unicode == NULL)
7407 return NULL;
7408 res = encode_code_page(CP_ACP, unicode, errors);
7409 Py_DECREF(unicode);
7410 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007411}
7412
7413PyObject *
7414PyUnicode_EncodeCodePage(int code_page,
7415 PyObject *unicode,
7416 const char *errors)
7417{
Victor Stinner7581cef2011-11-03 22:32:33 +01007418 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007419}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007420
Alexander Belopolsky40018472011-02-26 01:02:56 +00007421PyObject *
7422PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007423{
7424 if (!PyUnicode_Check(unicode)) {
7425 PyErr_BadArgument();
7426 return NULL;
7427 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007428 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007429}
7430
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007431#undef NEED_RETRY
7432
Victor Stinner99b95382011-07-04 14:23:54 +02007433#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007434
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435/* --- Character Mapping Codec -------------------------------------------- */
7436
Alexander Belopolsky40018472011-02-26 01:02:56 +00007437PyObject *
7438PyUnicode_DecodeCharmap(const char *s,
7439 Py_ssize_t size,
7440 PyObject *mapping,
7441 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007443 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007444 Py_ssize_t startinpos;
7445 Py_ssize_t endinpos;
7446 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007447 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007448 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007449 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007450 PyObject *errorHandler = NULL;
7451 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007452
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 /* Default to Latin-1 */
7454 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007457 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007461 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007462 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007463 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007464 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007465 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007466 enum PyUnicode_Kind mapkind;
7467 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007468 Py_UCS4 x;
7469
Benjamin Petersonbac79492012-01-14 13:34:47 -05007470 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007471 return NULL;
7472
7473 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007474 mapdata = PyUnicode_DATA(mapping);
7475 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007477 unsigned char ch;
7478 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7479 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7480 if (outkind == PyUnicode_1BYTE_KIND) {
7481 void *outdata = PyUnicode_DATA(v);
7482 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7483 while (s < e) {
7484 unsigned char ch = *s;
7485 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7486 if (x > maxchar)
7487 goto Error;
7488 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7489 ++s;
7490 }
7491 break;
7492 }
7493 else if (outkind == PyUnicode_2BYTE_KIND) {
7494 void *outdata = PyUnicode_DATA(v);
7495 while (s < e) {
7496 unsigned char ch = *s;
7497 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7498 if (x == 0xFFFE)
7499 goto Error;
7500 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7501 ++s;
7502 }
7503 break;
7504 }
7505 }
7506 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007507
Benjamin Peterson29060642009-01-31 22:14:21 +00007508 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007509 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007510 else
7511 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007512Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007513 if (x == 0xfffe)
7514 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 startinpos = s-starts;
7517 endinpos = startinpos+1;
7518 if (unicode_decode_call_errorhandler(
7519 errors, &errorHandler,
7520 "charmap", "character maps to <undefined>",
7521 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007522 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 goto onError;
7524 }
7525 continue;
7526 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007527
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007528 if (unicode_putchar(&v, &outpos, x) < 0)
7529 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007531 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007532 }
7533 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 while (s < e) {
7535 unsigned char ch = *s;
7536 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007537
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7539 w = PyLong_FromLong((long)ch);
7540 if (w == NULL)
7541 goto onError;
7542 x = PyObject_GetItem(mapping, w);
7543 Py_DECREF(w);
7544 if (x == NULL) {
7545 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7546 /* No mapping found means: mapping is undefined. */
7547 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007548 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007549 } else
7550 goto onError;
7551 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007552
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007554 if (x == Py_None)
7555 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 if (PyLong_Check(x)) {
7557 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007558 if (value == 0xFFFE)
7559 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007560 if (value < 0 || value > MAX_UNICODE) {
7561 PyErr_Format(PyExc_TypeError,
7562 "character mapping must be in range(0x%lx)",
7563 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 Py_DECREF(x);
7565 goto onError;
7566 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007567 if (unicode_putchar(&v, &outpos, value) < 0) {
7568 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007569 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007570 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007573 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007574
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007575 if (PyUnicode_READY(x) == -1) {
7576 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007577 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007578 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007579 targetsize = PyUnicode_GET_LENGTH(x);
7580
7581 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007582 /* 1-1 mapping */
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007583 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007584 if (value == 0xFFFE)
7585 goto Undefined;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007586 if (unicode_putchar(&v, &outpos, value) < 0) {
7587 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007588 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007589 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007590 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 else if (targetsize > 1) {
7592 /* 1-n mapping */
7593 if (targetsize > extrachars) {
7594 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 Py_ssize_t needed = (targetsize - extrachars) + \
7596 (targetsize << 2);
7597 extrachars += needed;
7598 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007599 if (unicode_resize(&v,
7600 PyUnicode_GET_LENGTH(v) + needed) < 0)
7601 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 Py_DECREF(x);
7603 goto onError;
7604 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007606 if (unicode_widen(&v, outpos,
7607 PyUnicode_MAX_CHAR_VALUE(x)) < 0) {
7608 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007609 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007610 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007611 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7612 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 extrachars -= targetsize;
7614 }
7615 /* 1-0 mapping: skip the character */
7616 }
7617 else {
7618 /* wrong return value */
7619 PyErr_SetString(PyExc_TypeError,
7620 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007621 Py_DECREF(x);
7622 goto onError;
7623 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 Py_DECREF(x);
7625 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007626 continue;
7627Undefined:
7628 /* undefined mapping */
7629 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007630 startinpos = s-starts;
7631 endinpos = startinpos+1;
7632 if (unicode_decode_call_errorhandler(
7633 errors, &errorHandler,
7634 "charmap", "character maps to <undefined>",
7635 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007636 &v, &outpos)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007637 goto onError;
7638 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007641 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007642 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007643 Py_XDECREF(errorHandler);
7644 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007645 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007646
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007648 Py_XDECREF(errorHandler);
7649 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650 Py_XDECREF(v);
7651 return NULL;
7652}
7653
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007654/* Charmap encoding: the lookup table */
7655
Alexander Belopolsky40018472011-02-26 01:02:56 +00007656struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 PyObject_HEAD
7658 unsigned char level1[32];
7659 int count2, count3;
7660 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007661};
7662
7663static PyObject*
7664encoding_map_size(PyObject *obj, PyObject* args)
7665{
7666 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007667 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007669}
7670
7671static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007672 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 PyDoc_STR("Return the size (in bytes) of this object") },
7674 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007675};
7676
7677static void
7678encoding_map_dealloc(PyObject* o)
7679{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007680 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007681}
7682
7683static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007684 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 "EncodingMap", /*tp_name*/
7686 sizeof(struct encoding_map), /*tp_basicsize*/
7687 0, /*tp_itemsize*/
7688 /* methods */
7689 encoding_map_dealloc, /*tp_dealloc*/
7690 0, /*tp_print*/
7691 0, /*tp_getattr*/
7692 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007693 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 0, /*tp_repr*/
7695 0, /*tp_as_number*/
7696 0, /*tp_as_sequence*/
7697 0, /*tp_as_mapping*/
7698 0, /*tp_hash*/
7699 0, /*tp_call*/
7700 0, /*tp_str*/
7701 0, /*tp_getattro*/
7702 0, /*tp_setattro*/
7703 0, /*tp_as_buffer*/
7704 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7705 0, /*tp_doc*/
7706 0, /*tp_traverse*/
7707 0, /*tp_clear*/
7708 0, /*tp_richcompare*/
7709 0, /*tp_weaklistoffset*/
7710 0, /*tp_iter*/
7711 0, /*tp_iternext*/
7712 encoding_map_methods, /*tp_methods*/
7713 0, /*tp_members*/
7714 0, /*tp_getset*/
7715 0, /*tp_base*/
7716 0, /*tp_dict*/
7717 0, /*tp_descr_get*/
7718 0, /*tp_descr_set*/
7719 0, /*tp_dictoffset*/
7720 0, /*tp_init*/
7721 0, /*tp_alloc*/
7722 0, /*tp_new*/
7723 0, /*tp_free*/
7724 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007725};
7726
7727PyObject*
7728PyUnicode_BuildEncodingMap(PyObject* string)
7729{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007730 PyObject *result;
7731 struct encoding_map *mresult;
7732 int i;
7733 int need_dict = 0;
7734 unsigned char level1[32];
7735 unsigned char level2[512];
7736 unsigned char *mlevel1, *mlevel2, *mlevel3;
7737 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007738 int kind;
7739 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007740 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007741 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007742
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007743 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007744 PyErr_BadArgument();
7745 return NULL;
7746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007747 kind = PyUnicode_KIND(string);
7748 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007749 length = PyUnicode_GET_LENGTH(string);
7750 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007751 memset(level1, 0xFF, sizeof level1);
7752 memset(level2, 0xFF, sizeof level2);
7753
7754 /* If there isn't a one-to-one mapping of NULL to \0,
7755 or if there are non-BMP characters, we need to use
7756 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007757 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007758 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007759 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007760 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007761 ch = PyUnicode_READ(kind, data, i);
7762 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007763 need_dict = 1;
7764 break;
7765 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007766 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007767 /* unmapped character */
7768 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007769 l1 = ch >> 11;
7770 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007771 if (level1[l1] == 0xFF)
7772 level1[l1] = count2++;
7773 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007774 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007775 }
7776
7777 if (count2 >= 0xFF || count3 >= 0xFF)
7778 need_dict = 1;
7779
7780 if (need_dict) {
7781 PyObject *result = PyDict_New();
7782 PyObject *key, *value;
7783 if (!result)
7784 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007785 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007786 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007787 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007788 if (!key || !value)
7789 goto failed1;
7790 if (PyDict_SetItem(result, key, value) == -1)
7791 goto failed1;
7792 Py_DECREF(key);
7793 Py_DECREF(value);
7794 }
7795 return result;
7796 failed1:
7797 Py_XDECREF(key);
7798 Py_XDECREF(value);
7799 Py_DECREF(result);
7800 return NULL;
7801 }
7802
7803 /* Create a three-level trie */
7804 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7805 16*count2 + 128*count3 - 1);
7806 if (!result)
7807 return PyErr_NoMemory();
7808 PyObject_Init(result, &EncodingMapType);
7809 mresult = (struct encoding_map*)result;
7810 mresult->count2 = count2;
7811 mresult->count3 = count3;
7812 mlevel1 = mresult->level1;
7813 mlevel2 = mresult->level23;
7814 mlevel3 = mresult->level23 + 16*count2;
7815 memcpy(mlevel1, level1, 32);
7816 memset(mlevel2, 0xFF, 16*count2);
7817 memset(mlevel3, 0, 128*count3);
7818 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007819 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007821 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7822 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007823 /* unmapped character */
7824 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007825 o1 = ch>>11;
7826 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007827 i2 = 16*mlevel1[o1] + o2;
7828 if (mlevel2[i2] == 0xFF)
7829 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007830 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007831 i3 = 128*mlevel2[i2] + o3;
7832 mlevel3[i3] = i;
7833 }
7834 return result;
7835}
7836
7837static int
Victor Stinner22168992011-11-20 17:09:18 +01007838encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007839{
7840 struct encoding_map *map = (struct encoding_map*)mapping;
7841 int l1 = c>>11;
7842 int l2 = (c>>7) & 0xF;
7843 int l3 = c & 0x7F;
7844 int i;
7845
Victor Stinner22168992011-11-20 17:09:18 +01007846 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007848 if (c == 0)
7849 return 0;
7850 /* level 1*/
7851 i = map->level1[l1];
7852 if (i == 0xFF) {
7853 return -1;
7854 }
7855 /* level 2*/
7856 i = map->level23[16*i+l2];
7857 if (i == 0xFF) {
7858 return -1;
7859 }
7860 /* level 3 */
7861 i = map->level23[16*map->count2 + 128*i + l3];
7862 if (i == 0) {
7863 return -1;
7864 }
7865 return i;
7866}
7867
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007868/* Lookup the character ch in the mapping. If the character
7869 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007870 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007871static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007872charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873{
Christian Heimes217cfd12007-12-02 14:31:20 +00007874 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007875 PyObject *x;
7876
7877 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007879 x = PyObject_GetItem(mapping, w);
7880 Py_DECREF(w);
7881 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7883 /* No mapping found means: mapping is undefined. */
7884 PyErr_Clear();
7885 x = Py_None;
7886 Py_INCREF(x);
7887 return x;
7888 } else
7889 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007891 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007893 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007894 long value = PyLong_AS_LONG(x);
7895 if (value < 0 || value > 255) {
7896 PyErr_SetString(PyExc_TypeError,
7897 "character mapping must be in range(256)");
7898 Py_DECREF(x);
7899 return NULL;
7900 }
7901 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007903 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 /* wrong return value */
7907 PyErr_Format(PyExc_TypeError,
7908 "character mapping must return integer, bytes or None, not %.400s",
7909 x->ob_type->tp_name);
7910 Py_DECREF(x);
7911 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912 }
7913}
7914
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007915static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007916charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007917{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007918 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7919 /* exponentially overallocate to minimize reallocations */
7920 if (requiredsize < 2*outsize)
7921 requiredsize = 2*outsize;
7922 if (_PyBytes_Resize(outobj, requiredsize))
7923 return -1;
7924 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007925}
7926
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007930/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007931 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007932 space is available. Return a new reference to the object that
7933 was put in the output buffer, or Py_None, if the mapping was undefined
7934 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007935 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007936static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007937charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007938 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007939{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007940 PyObject *rep;
7941 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007942 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943
Christian Heimes90aa7642007-12-19 02:45:37 +00007944 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007945 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007947 if (res == -1)
7948 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 if (outsize<requiredsize)
7950 if (charmapencode_resize(outobj, outpos, requiredsize))
7951 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007952 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 outstart[(*outpos)++] = (char)res;
7954 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007955 }
7956
7957 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007958 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007960 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 Py_DECREF(rep);
7962 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007963 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 if (PyLong_Check(rep)) {
7965 Py_ssize_t requiredsize = *outpos+1;
7966 if (outsize<requiredsize)
7967 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7968 Py_DECREF(rep);
7969 return enc_EXCEPTION;
7970 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007971 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007973 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 else {
7975 const char *repchars = PyBytes_AS_STRING(rep);
7976 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7977 Py_ssize_t requiredsize = *outpos+repsize;
7978 if (outsize<requiredsize)
7979 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7980 Py_DECREF(rep);
7981 return enc_EXCEPTION;
7982 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007983 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 memcpy(outstart + *outpos, repchars, repsize);
7985 *outpos += repsize;
7986 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007987 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007988 Py_DECREF(rep);
7989 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007990}
7991
7992/* handle an error in PyUnicode_EncodeCharmap
7993 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007994static int
7995charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007996 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007997 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007998 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007999 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008000{
8001 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008002 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008003 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008004 enum PyUnicode_Kind kind;
8005 void *data;
8006 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008007 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008008 Py_ssize_t collstartpos = *inpos;
8009 Py_ssize_t collendpos = *inpos+1;
8010 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008011 char *encoding = "charmap";
8012 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008013 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008014 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008015 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008016
Benjamin Petersonbac79492012-01-14 13:34:47 -05008017 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008018 return -1;
8019 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008020 /* find all unencodable characters */
8021 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008022 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008023 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008024 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008025 val = encoding_map_lookup(ch, mapping);
8026 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 break;
8028 ++collendpos;
8029 continue;
8030 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008031
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008032 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8033 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 if (rep==NULL)
8035 return -1;
8036 else if (rep!=Py_None) {
8037 Py_DECREF(rep);
8038 break;
8039 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008040 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008042 }
8043 /* cache callback name lookup
8044 * (if not done yet, i.e. it's the first error) */
8045 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 if ((errors==NULL) || (!strcmp(errors, "strict")))
8047 *known_errorHandler = 1;
8048 else if (!strcmp(errors, "replace"))
8049 *known_errorHandler = 2;
8050 else if (!strcmp(errors, "ignore"))
8051 *known_errorHandler = 3;
8052 else if (!strcmp(errors, "xmlcharrefreplace"))
8053 *known_errorHandler = 4;
8054 else
8055 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008056 }
8057 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008058 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008059 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008060 return -1;
8061 case 2: /* replace */
8062 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 x = charmapencode_output('?', mapping, res, respos);
8064 if (x==enc_EXCEPTION) {
8065 return -1;
8066 }
8067 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008068 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 return -1;
8070 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008071 }
8072 /* fall through */
8073 case 3: /* ignore */
8074 *inpos = collendpos;
8075 break;
8076 case 4: /* xmlcharrefreplace */
8077 /* generate replacement (temporarily (mis)uses p) */
8078 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 char buffer[2+29+1+1];
8080 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008081 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 for (cp = buffer; *cp; ++cp) {
8083 x = charmapencode_output(*cp, mapping, res, respos);
8084 if (x==enc_EXCEPTION)
8085 return -1;
8086 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008087 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 return -1;
8089 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008090 }
8091 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008092 *inpos = collendpos;
8093 break;
8094 default:
8095 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008096 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008098 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008100 if (PyBytes_Check(repunicode)) {
8101 /* Directly copy bytes result to output. */
8102 Py_ssize_t outsize = PyBytes_Size(*res);
8103 Py_ssize_t requiredsize;
8104 repsize = PyBytes_Size(repunicode);
8105 requiredsize = *respos + repsize;
8106 if (requiredsize > outsize)
8107 /* Make room for all additional bytes. */
8108 if (charmapencode_resize(res, respos, requiredsize)) {
8109 Py_DECREF(repunicode);
8110 return -1;
8111 }
8112 memcpy(PyBytes_AsString(*res) + *respos,
8113 PyBytes_AsString(repunicode), repsize);
8114 *respos += repsize;
8115 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008116 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008117 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008118 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008119 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008120 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008121 Py_DECREF(repunicode);
8122 return -1;
8123 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008124 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008125 data = PyUnicode_DATA(repunicode);
8126 kind = PyUnicode_KIND(repunicode);
8127 for (index = 0; index < repsize; index++) {
8128 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8129 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008131 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 return -1;
8133 }
8134 else if (x==enc_FAILED) {
8135 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008136 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 return -1;
8138 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008139 }
8140 *inpos = newpos;
8141 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008142 }
8143 return 0;
8144}
8145
Alexander Belopolsky40018472011-02-26 01:02:56 +00008146PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008147_PyUnicode_EncodeCharmap(PyObject *unicode,
8148 PyObject *mapping,
8149 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008151 /* output object */
8152 PyObject *res = NULL;
8153 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008154 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008155 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008156 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008157 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008158 PyObject *errorHandler = NULL;
8159 PyObject *exc = NULL;
8160 /* the following variable is used for caching string comparisons
8161 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8162 * 3=ignore, 4=xmlcharrefreplace */
8163 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164
Benjamin Petersonbac79492012-01-14 13:34:47 -05008165 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008166 return NULL;
8167 size = PyUnicode_GET_LENGTH(unicode);
8168
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169 /* Default to Latin-1 */
8170 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008171 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008173 /* allocate enough for a simple encoding without
8174 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008175 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008176 if (res == NULL)
8177 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008178 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008181 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008182 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008184 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 if (x==enc_EXCEPTION) /* error */
8186 goto onError;
8187 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008188 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 &exc,
8190 &known_errorHandler, &errorHandler, errors,
8191 &res, &respos)) {
8192 goto onError;
8193 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 else
8196 /* done with this character => adjust input position */
8197 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008200 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008201 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008202 if (_PyBytes_Resize(&res, respos) < 0)
8203 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008204
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008205 Py_XDECREF(exc);
8206 Py_XDECREF(errorHandler);
8207 return res;
8208
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008210 Py_XDECREF(res);
8211 Py_XDECREF(exc);
8212 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 return NULL;
8214}
8215
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008216/* Deprecated */
8217PyObject *
8218PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8219 Py_ssize_t size,
8220 PyObject *mapping,
8221 const char *errors)
8222{
8223 PyObject *result;
8224 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8225 if (unicode == NULL)
8226 return NULL;
8227 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8228 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008229 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008230}
8231
Alexander Belopolsky40018472011-02-26 01:02:56 +00008232PyObject *
8233PyUnicode_AsCharmapString(PyObject *unicode,
8234 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235{
8236 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 PyErr_BadArgument();
8238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008240 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241}
8242
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008244static void
8245make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008246 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008247 Py_ssize_t startpos, Py_ssize_t endpos,
8248 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008250 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008251 *exceptionObject = _PyUnicodeTranslateError_Create(
8252 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253 }
8254 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8256 goto onError;
8257 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8258 goto onError;
8259 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8260 goto onError;
8261 return;
8262 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008263 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 }
8265}
8266
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008267/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008268static void
8269raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008270 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008271 Py_ssize_t startpos, Py_ssize_t endpos,
8272 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273{
8274 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008275 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278}
8279
8280/* error handling callback helper:
8281 build arguments, call the callback and check the arguments,
8282 put the result into newpos and return the replacement string, which
8283 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008284static PyObject *
8285unicode_translate_call_errorhandler(const char *errors,
8286 PyObject **errorHandler,
8287 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008288 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008289 Py_ssize_t startpos, Py_ssize_t endpos,
8290 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008291{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008292 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008293
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008294 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008295 PyObject *restuple;
8296 PyObject *resunicode;
8297
8298 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 }
8303
8304 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008305 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008308
8309 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008311 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008313 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008314 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 Py_DECREF(restuple);
8316 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008317 }
8318 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 &resunicode, &i_newpos)) {
8320 Py_DECREF(restuple);
8321 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008323 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008325 else
8326 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8329 Py_DECREF(restuple);
8330 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008331 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008332 Py_INCREF(resunicode);
8333 Py_DECREF(restuple);
8334 return resunicode;
8335}
8336
8337/* Lookup the character ch in the mapping and put the result in result,
8338 which must be decrefed by the caller.
8339 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008340static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008341charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008342{
Christian Heimes217cfd12007-12-02 14:31:20 +00008343 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344 PyObject *x;
8345
8346 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348 x = PyObject_GetItem(mapping, w);
8349 Py_DECREF(w);
8350 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8352 /* No mapping found means: use 1:1 mapping. */
8353 PyErr_Clear();
8354 *result = NULL;
8355 return 0;
8356 } else
8357 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008358 }
8359 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 *result = x;
8361 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008363 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 long value = PyLong_AS_LONG(x);
8365 long max = PyUnicode_GetMax();
8366 if (value < 0 || value > max) {
8367 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008368 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 Py_DECREF(x);
8370 return -1;
8371 }
8372 *result = x;
8373 return 0;
8374 }
8375 else if (PyUnicode_Check(x)) {
8376 *result = x;
8377 return 0;
8378 }
8379 else {
8380 /* wrong return value */
8381 PyErr_SetString(PyExc_TypeError,
8382 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008383 Py_DECREF(x);
8384 return -1;
8385 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386}
8387/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 if not reallocate and adjust various state variables.
8389 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008390static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008395 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008396 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 /* exponentially overallocate to minimize reallocations */
8398 if (requiredsize < 2 * oldsize)
8399 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008400 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8401 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008403 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405 }
8406 return 0;
8407}
8408/* lookup the character, put the result in the output string and adjust
8409 various state variables. Return a new reference to the object that
8410 was put in the output buffer in *result, or Py_None, if the mapping was
8411 undefined (in which case no character was written).
8412 The called must decref result.
8413 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008414static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8416 PyObject *mapping, Py_UCS4 **output,
8417 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008418 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008420 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8421 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008425 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426 }
8427 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008429 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008431 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432 }
8433 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 Py_ssize_t repsize;
8435 if (PyUnicode_READY(*res) == -1)
8436 return -1;
8437 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 if (repsize==1) {
8439 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 }
8442 else if (repsize!=0) {
8443 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444 Py_ssize_t requiredsize = *opos +
8445 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 Py_ssize_t i;
8448 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450 for(i = 0; i < repsize; i++)
8451 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453 }
8454 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008456 return 0;
8457}
8458
Alexander Belopolsky40018472011-02-26 01:02:56 +00008459PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460_PyUnicode_TranslateCharmap(PyObject *input,
8461 PyObject *mapping,
8462 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 /* input object */
8465 char *idata;
8466 Py_ssize_t size, i;
8467 int kind;
8468 /* output buffer */
8469 Py_UCS4 *output = NULL;
8470 Py_ssize_t osize;
8471 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008474 char *reason = "character maps to <undefined>";
8475 PyObject *errorHandler = NULL;
8476 PyObject *exc = NULL;
8477 /* the following variable is used for caching string comparisons
8478 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8479 * 3=ignore, 4=xmlcharrefreplace */
8480 int known_errorHandler = -1;
8481
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 PyErr_BadArgument();
8484 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 if (PyUnicode_READY(input) == -1)
8488 return NULL;
8489 idata = (char*)PyUnicode_DATA(input);
8490 kind = PyUnicode_KIND(input);
8491 size = PyUnicode_GET_LENGTH(input);
8492 i = 0;
8493
8494 if (size == 0) {
8495 Py_INCREF(input);
8496 return input;
8497 }
8498
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008499 /* allocate enough for a simple 1:1 translation without
8500 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 osize = size;
8502 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8503 opos = 0;
8504 if (output == NULL) {
8505 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 /* try to encode it */
8511 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 if (charmaptranslate_output(input, i, mapping,
8513 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 Py_XDECREF(x);
8515 goto onError;
8516 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008517 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 else { /* untranslatable character */
8521 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8522 Py_ssize_t repsize;
8523 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008524 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 Py_ssize_t collstart = i;
8527 Py_ssize_t collend = i+1;
8528 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 while (collend < size) {
8532 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 goto onError;
8534 Py_XDECREF(x);
8535 if (x!=Py_None)
8536 break;
8537 ++collend;
8538 }
8539 /* cache callback name lookup
8540 * (if not done yet, i.e. it's the first error) */
8541 if (known_errorHandler==-1) {
8542 if ((errors==NULL) || (!strcmp(errors, "strict")))
8543 known_errorHandler = 1;
8544 else if (!strcmp(errors, "replace"))
8545 known_errorHandler = 2;
8546 else if (!strcmp(errors, "ignore"))
8547 known_errorHandler = 3;
8548 else if (!strcmp(errors, "xmlcharrefreplace"))
8549 known_errorHandler = 4;
8550 else
8551 known_errorHandler = 0;
8552 }
8553 switch (known_errorHandler) {
8554 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008555 raise_translate_exception(&exc, input, collstart,
8556 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008557 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 case 2: /* replace */
8559 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560 for (coll = collstart; coll<collend; coll++)
8561 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 /* fall through */
8563 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 break;
8566 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 /* generate replacement (temporarily (mis)uses i) */
8568 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 char buffer[2+29+1+1];
8570 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8572 if (charmaptranslate_makespace(&output, &osize,
8573 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 goto onError;
8575 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 break;
8580 default:
8581 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 reason, input, &exc,
8583 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008584 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008586 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008587 Py_DECREF(repunicode);
8588 goto onError;
8589 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008591 repsize = PyUnicode_GET_LENGTH(repunicode);
8592 if (charmaptranslate_makespace(&output, &osize,
8593 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 Py_DECREF(repunicode);
8595 goto onError;
8596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008597 for (uni2 = 0; repsize-->0; ++uni2)
8598 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8599 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008600 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008601 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008602 }
8603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8605 if (!res)
8606 goto onError;
8607 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008608 Py_XDECREF(exc);
8609 Py_XDECREF(errorHandler);
8610 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008614 Py_XDECREF(exc);
8615 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 return NULL;
8617}
8618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619/* Deprecated. Use PyUnicode_Translate instead. */
8620PyObject *
8621PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8622 Py_ssize_t size,
8623 PyObject *mapping,
8624 const char *errors)
8625{
Christian Heimes5f520f42012-09-11 14:03:25 +02008626 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8628 if (!unicode)
8629 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008630 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8631 Py_DECREF(unicode);
8632 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633}
8634
Alexander Belopolsky40018472011-02-26 01:02:56 +00008635PyObject *
8636PyUnicode_Translate(PyObject *str,
8637 PyObject *mapping,
8638 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639{
8640 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008641
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642 str = PyUnicode_FromObject(str);
8643 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008644 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 Py_DECREF(str);
8647 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648}
Tim Petersced69f82003-09-16 20:30:58 +00008649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008651fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652{
8653 /* No need to call PyUnicode_READY(self) because this function is only
8654 called as a callback from fixup() which does it already. */
8655 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8656 const int kind = PyUnicode_KIND(self);
8657 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008658 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008659 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 Py_ssize_t i;
8661
8662 for (i = 0; i < len; ++i) {
8663 ch = PyUnicode_READ(kind, data, i);
8664 fixed = 0;
8665 if (ch > 127) {
8666 if (Py_UNICODE_ISSPACE(ch))
8667 fixed = ' ';
8668 else {
8669 const int decimal = Py_UNICODE_TODECIMAL(ch);
8670 if (decimal >= 0)
8671 fixed = '0' + decimal;
8672 }
8673 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008674 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008675 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 PyUnicode_WRITE(kind, data, i, fixed);
8677 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008678 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008679 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 }
8682
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008683 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684}
8685
8686PyObject *
8687_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8688{
8689 if (!PyUnicode_Check(unicode)) {
8690 PyErr_BadInternalCall();
8691 return NULL;
8692 }
8693 if (PyUnicode_READY(unicode) == -1)
8694 return NULL;
8695 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8696 /* If the string is already ASCII, just return the same string */
8697 Py_INCREF(unicode);
8698 return unicode;
8699 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008700 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701}
8702
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008703PyObject *
8704PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8705 Py_ssize_t length)
8706{
Victor Stinnerf0124502011-11-21 23:12:56 +01008707 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008708 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008709 Py_UCS4 maxchar;
8710 enum PyUnicode_Kind kind;
8711 void *data;
8712
Victor Stinner99d7ad02012-02-22 13:37:39 +01008713 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008714 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008715 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008716 if (ch > 127) {
8717 int decimal = Py_UNICODE_TODECIMAL(ch);
8718 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008719 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008720 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008721 }
8722 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008723
8724 /* Copy to a new string */
8725 decimal = PyUnicode_New(length, maxchar);
8726 if (decimal == NULL)
8727 return decimal;
8728 kind = PyUnicode_KIND(decimal);
8729 data = PyUnicode_DATA(decimal);
8730 /* Iterate over code points */
8731 for (i = 0; i < length; i++) {
8732 Py_UNICODE ch = s[i];
8733 if (ch > 127) {
8734 int decimal = Py_UNICODE_TODECIMAL(ch);
8735 if (decimal >= 0)
8736 ch = '0' + decimal;
8737 }
8738 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008740 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008741}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008742/* --- Decimal Encoder ---------------------------------------------------- */
8743
Alexander Belopolsky40018472011-02-26 01:02:56 +00008744int
8745PyUnicode_EncodeDecimal(Py_UNICODE *s,
8746 Py_ssize_t length,
8747 char *output,
8748 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008749{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008750 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008751 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008752 enum PyUnicode_Kind kind;
8753 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008754
8755 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 PyErr_BadArgument();
8757 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008758 }
8759
Victor Stinner42bf7752011-11-21 22:52:58 +01008760 unicode = PyUnicode_FromUnicode(s, length);
8761 if (unicode == NULL)
8762 return -1;
8763
Benjamin Petersonbac79492012-01-14 13:34:47 -05008764 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008765 Py_DECREF(unicode);
8766 return -1;
8767 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008768 kind = PyUnicode_KIND(unicode);
8769 data = PyUnicode_DATA(unicode);
8770
Victor Stinnerb84d7232011-11-22 01:50:07 +01008771 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008772 PyObject *exc;
8773 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008775 Py_ssize_t startpos;
8776
8777 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008778
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008780 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008781 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008783 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 decimal = Py_UNICODE_TODECIMAL(ch);
8785 if (decimal >= 0) {
8786 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008787 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 continue;
8789 }
8790 if (0 < ch && ch < 256) {
8791 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008792 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 continue;
8794 }
Victor Stinner6345be92011-11-25 20:09:01 +01008795
Victor Stinner42bf7752011-11-21 22:52:58 +01008796 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008797 exc = NULL;
8798 raise_encode_exception(&exc, "decimal", unicode,
8799 startpos, startpos+1,
8800 "invalid decimal Unicode string");
8801 Py_XDECREF(exc);
8802 Py_DECREF(unicode);
8803 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008804 }
8805 /* 0-terminate the output string */
8806 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008807 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008808 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008809}
8810
Guido van Rossumd57fd912000-03-10 22:53:23 +00008811/* --- Helpers ------------------------------------------------------------ */
8812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008814any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815 Py_ssize_t start,
8816 Py_ssize_t end)
8817{
8818 int kind1, kind2, kind;
8819 void *buf1, *buf2;
8820 Py_ssize_t len1, len2, result;
8821
8822 kind1 = PyUnicode_KIND(s1);
8823 kind2 = PyUnicode_KIND(s2);
8824 kind = kind1 > kind2 ? kind1 : kind2;
8825 buf1 = PyUnicode_DATA(s1);
8826 buf2 = PyUnicode_DATA(s2);
8827 if (kind1 != kind)
8828 buf1 = _PyUnicode_AsKind(s1, kind);
8829 if (!buf1)
8830 return -2;
8831 if (kind2 != kind)
8832 buf2 = _PyUnicode_AsKind(s2, kind);
8833 if (!buf2) {
8834 if (kind1 != kind) PyMem_Free(buf1);
8835 return -2;
8836 }
8837 len1 = PyUnicode_GET_LENGTH(s1);
8838 len2 = PyUnicode_GET_LENGTH(s2);
8839
Victor Stinner794d5672011-10-10 03:21:36 +02008840 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008841 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008842 case PyUnicode_1BYTE_KIND:
8843 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8844 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8845 else
8846 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8847 break;
8848 case PyUnicode_2BYTE_KIND:
8849 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8850 break;
8851 case PyUnicode_4BYTE_KIND:
8852 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8853 break;
8854 default:
8855 assert(0); result = -2;
8856 }
8857 }
8858 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008859 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008860 case PyUnicode_1BYTE_KIND:
8861 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8862 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8863 else
8864 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8865 break;
8866 case PyUnicode_2BYTE_KIND:
8867 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8868 break;
8869 case PyUnicode_4BYTE_KIND:
8870 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8871 break;
8872 default:
8873 assert(0); result = -2;
8874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 }
8876
8877 if (kind1 != kind)
8878 PyMem_Free(buf1);
8879 if (kind2 != kind)
8880 PyMem_Free(buf2);
8881
8882 return result;
8883}
8884
8885Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008886_PyUnicode_InsertThousandsGrouping(
8887 PyObject *unicode, Py_ssize_t index,
8888 Py_ssize_t n_buffer,
8889 void *digits, Py_ssize_t n_digits,
8890 Py_ssize_t min_width,
8891 const char *grouping, PyObject *thousands_sep,
8892 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893{
Victor Stinner41a863c2012-02-24 00:37:51 +01008894 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008895 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008896 Py_ssize_t thousands_sep_len;
8897 Py_ssize_t len;
8898
8899 if (unicode != NULL) {
8900 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008901 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008902 }
8903 else {
8904 kind = PyUnicode_1BYTE_KIND;
8905 data = NULL;
8906 }
8907 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8908 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8909 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8910 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008911 if (thousands_sep_kind < kind) {
8912 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8913 if (!thousands_sep_data)
8914 return -1;
8915 }
8916 else {
8917 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8918 if (!data)
8919 return -1;
8920 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008921 }
8922
Benjamin Petersonead6b532011-12-20 17:23:42 -06008923 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008925 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008926 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008927 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008928 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008929 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008930 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008931 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008932 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008933 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008934 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008935 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008937 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008938 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008939 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008940 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008941 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008943 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008944 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008945 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008946 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008947 break;
8948 default:
8949 assert(0);
8950 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008952 if (unicode != NULL && thousands_sep_kind != kind) {
8953 if (thousands_sep_kind < kind)
8954 PyMem_Free(thousands_sep_data);
8955 else
8956 PyMem_Free(data);
8957 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008958 if (unicode == NULL) {
8959 *maxchar = 127;
8960 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07008961 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02008962 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008963 }
8964 }
8965 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966}
8967
8968
/* Helper macro to fix up start/end slice values against a length.

   end:   clipped to len if larger; if negative, len is added and the
          result is clipped to 0.
   start: if negative, len is added and the result is clipped to 0.
          start is NOT clipped to len.

   start and end must be modifiable lvalues; they are updated in place.
   All three arguments are evaluated more than once, so they must not have
   side effects.  The body is wrapped in do { } while (0) so that the macro
   behaves as a single statement (e.g. as the body of an unbraced if/else);
   use it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008983
Alexander Belopolsky40018472011-02-26 01:02:56 +00008984Py_ssize_t
8985PyUnicode_Count(PyObject *str,
8986 PyObject *substr,
8987 Py_ssize_t start,
8988 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008990 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008991 PyObject* str_obj;
8992 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008993 int kind1, kind2, kind;
8994 void *buf1 = NULL, *buf2 = NULL;
8995 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008996
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008997 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008998 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009000 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009001 if (!sub_obj) {
9002 Py_DECREF(str_obj);
9003 return -1;
9004 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009005 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009006 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 Py_DECREF(str_obj);
9008 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009 }
Tim Petersced69f82003-09-16 20:30:58 +00009010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 kind1 = PyUnicode_KIND(str_obj);
9012 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009013 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009015 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009016 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009017 if (kind2 > kind) {
9018 Py_DECREF(sub_obj);
9019 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009020 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009021 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009022 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024 if (!buf2)
9025 goto onError;
9026 len1 = PyUnicode_GET_LENGTH(str_obj);
9027 len2 = PyUnicode_GET_LENGTH(sub_obj);
9028
9029 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009030 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009032 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9033 result = asciilib_count(
9034 ((Py_UCS1*)buf1) + start, end - start,
9035 buf2, len2, PY_SSIZE_T_MAX
9036 );
9037 else
9038 result = ucs1lib_count(
9039 ((Py_UCS1*)buf1) + start, end - start,
9040 buf2, len2, PY_SSIZE_T_MAX
9041 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 break;
9043 case PyUnicode_2BYTE_KIND:
9044 result = ucs2lib_count(
9045 ((Py_UCS2*)buf1) + start, end - start,
9046 buf2, len2, PY_SSIZE_T_MAX
9047 );
9048 break;
9049 case PyUnicode_4BYTE_KIND:
9050 result = ucs4lib_count(
9051 ((Py_UCS4*)buf1) + start, end - start,
9052 buf2, len2, PY_SSIZE_T_MAX
9053 );
9054 break;
9055 default:
9056 assert(0); result = 0;
9057 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009058
9059 Py_DECREF(sub_obj);
9060 Py_DECREF(str_obj);
9061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 if (kind2 != kind)
9063 PyMem_Free(buf2);
9064
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 onError:
9067 Py_DECREF(sub_obj);
9068 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 if (kind2 != kind && buf2)
9070 PyMem_Free(buf2);
9071 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072}
9073
Alexander Belopolsky40018472011-02-26 01:02:56 +00009074Py_ssize_t
9075PyUnicode_Find(PyObject *str,
9076 PyObject *sub,
9077 Py_ssize_t start,
9078 Py_ssize_t end,
9079 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009081 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009082
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009084 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009086 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009087 if (!sub) {
9088 Py_DECREF(str);
9089 return -2;
9090 }
9091 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9092 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009093 Py_DECREF(str);
9094 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095 }
Tim Petersced69f82003-09-16 20:30:58 +00009096
Victor Stinner794d5672011-10-10 03:21:36 +02009097 result = any_find_slice(direction,
9098 str, sub, start, end
9099 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009100
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009102 Py_DECREF(sub);
9103
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 return result;
9105}
9106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107Py_ssize_t
9108PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9109 Py_ssize_t start, Py_ssize_t end,
9110 int direction)
9111{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009113 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114 if (PyUnicode_READY(str) == -1)
9115 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009116 if (start < 0 || end < 0) {
9117 PyErr_SetString(PyExc_IndexError, "string index out of range");
9118 return -2;
9119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 if (end > PyUnicode_GET_LENGTH(str))
9121 end = PyUnicode_GET_LENGTH(str);
9122 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009123 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9124 kind, end-start, ch, direction);
9125 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009127 else
9128 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009129}
9130
Alexander Belopolsky40018472011-02-26 01:02:56 +00009131static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009132tailmatch(PyObject *self,
9133 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009134 Py_ssize_t start,
9135 Py_ssize_t end,
9136 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 int kind_self;
9139 int kind_sub;
9140 void *data_self;
9141 void *data_sub;
9142 Py_ssize_t offset;
9143 Py_ssize_t i;
9144 Py_ssize_t end_sub;
9145
9146 if (PyUnicode_READY(self) == -1 ||
9147 PyUnicode_READY(substring) == -1)
9148 return 0;
9149
9150 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151 return 1;
9152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9154 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 kind_self = PyUnicode_KIND(self);
9159 data_self = PyUnicode_DATA(self);
9160 kind_sub = PyUnicode_KIND(substring);
9161 data_sub = PyUnicode_DATA(substring);
9162 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9163
9164 if (direction > 0)
9165 offset = end;
9166 else
9167 offset = start;
9168
9169 if (PyUnicode_READ(kind_self, data_self, offset) ==
9170 PyUnicode_READ(kind_sub, data_sub, 0) &&
9171 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9172 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9173 /* If both are of the same kind, memcmp is sufficient */
9174 if (kind_self == kind_sub) {
9175 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009176 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009177 data_sub,
9178 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009179 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 }
9181 /* otherwise we have to compare each character by first accesing it */
9182 else {
9183 /* We do not need to compare 0 and len(substring)-1 because
9184 the if statement above ensured already that they are equal
9185 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02009186 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 for (i = 1; i < end_sub; ++i) {
9188 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9189 PyUnicode_READ(kind_sub, data_sub, i))
9190 return 0;
9191 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009192 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009194 }
9195
9196 return 0;
9197}
9198
Alexander Belopolsky40018472011-02-26 01:02:56 +00009199Py_ssize_t
9200PyUnicode_Tailmatch(PyObject *str,
9201 PyObject *substr,
9202 Py_ssize_t start,
9203 Py_ssize_t end,
9204 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009206 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009207
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208 str = PyUnicode_FromObject(str);
9209 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009210 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009211 substr = PyUnicode_FromObject(substr);
9212 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009213 Py_DECREF(str);
9214 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215 }
Tim Petersced69f82003-09-16 20:30:58 +00009216
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009217 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009218 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219 Py_DECREF(str);
9220 Py_DECREF(substr);
9221 return result;
9222}
9223
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224/* Apply fixfct filter to the Unicode object self and return a
9225 reference to the modified object */
9226
Alexander Belopolsky40018472011-02-26 01:02:56 +00009227static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009228fixup(PyObject *self,
9229 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231 PyObject *u;
9232 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009233 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009235 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009237 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009238 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 /* fix functions return the new maximum character in a string,
9241 if the kind of the resulting unicode object does not change,
9242 everything is fine. Otherwise we need to change the string kind
9243 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009244 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009245
9246 if (maxchar_new == 0) {
9247 /* no changes */;
9248 if (PyUnicode_CheckExact(self)) {
9249 Py_DECREF(u);
9250 Py_INCREF(self);
9251 return self;
9252 }
9253 else
9254 return u;
9255 }
9256
Victor Stinnere6abb482012-05-02 01:15:40 +02009257 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258
Victor Stinnereaab6042011-12-11 22:22:39 +01009259 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009260 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009261
9262 /* In case the maximum character changed, we need to
9263 convert the string to the new category. */
9264 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9265 if (v == NULL) {
9266 Py_DECREF(u);
9267 return NULL;
9268 }
9269 if (maxchar_new > maxchar_old) {
9270 /* If the maxchar increased so that the kind changed, not all
9271 characters are representable anymore and we need to fix the
9272 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009273 _PyUnicode_FastCopyCharacters(v, 0,
9274 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009275 maxchar_old = fixfct(v);
9276 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 }
9278 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009279 _PyUnicode_FastCopyCharacters(v, 0,
9280 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009282 Py_DECREF(u);
9283 assert(_PyUnicode_CheckConsistency(v, 1));
9284 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285}
9286
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009287static PyObject *
9288ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009290 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9291 char *resdata, *data = PyUnicode_DATA(self);
9292 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009293
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009294 res = PyUnicode_New(len, 127);
9295 if (res == NULL)
9296 return NULL;
9297 resdata = PyUnicode_DATA(res);
9298 if (lower)
9299 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009301 _Py_bytes_upper(resdata, data, len);
9302 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009303}
9304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009306handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009307{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009308 Py_ssize_t j;
9309 int final_sigma;
9310 Py_UCS4 c;
9311 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009312
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009313 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9314
9315 where ! is a negation and \p{xxx} is a character with property xxx.
9316 */
9317 for (j = i - 1; j >= 0; j--) {
9318 c = PyUnicode_READ(kind, data, j);
9319 if (!_PyUnicode_IsCaseIgnorable(c))
9320 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009322 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9323 if (final_sigma) {
9324 for (j = i + 1; j < length; j++) {
9325 c = PyUnicode_READ(kind, data, j);
9326 if (!_PyUnicode_IsCaseIgnorable(c))
9327 break;
9328 }
9329 final_sigma = j == length || !_PyUnicode_IsCased(c);
9330 }
9331 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009332}
9333
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009334static int
9335lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9336 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009338 /* Obscure special case. */
9339 if (c == 0x3A3) {
9340 mapped[0] = handle_capital_sigma(kind, data, length, i);
9341 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009343 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344}
9345
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009346static Py_ssize_t
9347do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009349 Py_ssize_t i, k = 0;
9350 int n_res, j;
9351 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009352
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009353 c = PyUnicode_READ(kind, data, 0);
9354 n_res = _PyUnicode_ToUpperFull(c, mapped);
9355 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009356 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009357 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009359 for (i = 1; i < length; i++) {
9360 c = PyUnicode_READ(kind, data, i);
9361 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9362 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009363 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009364 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009365 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009366 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009367 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368}
9369
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009370static Py_ssize_t
9371do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9372 Py_ssize_t i, k = 0;
9373
9374 for (i = 0; i < length; i++) {
9375 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9376 int n_res, j;
9377 if (Py_UNICODE_ISUPPER(c)) {
9378 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9379 }
9380 else if (Py_UNICODE_ISLOWER(c)) {
9381 n_res = _PyUnicode_ToUpperFull(c, mapped);
9382 }
9383 else {
9384 n_res = 1;
9385 mapped[0] = c;
9386 }
9387 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009388 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009389 res[k++] = mapped[j];
9390 }
9391 }
9392 return k;
9393}
9394
9395static Py_ssize_t
9396do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9397 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009399 Py_ssize_t i, k = 0;
9400
9401 for (i = 0; i < length; i++) {
9402 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9403 int n_res, j;
9404 if (lower)
9405 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9406 else
9407 n_res = _PyUnicode_ToUpperFull(c, mapped);
9408 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009409 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009410 res[k++] = mapped[j];
9411 }
9412 }
9413 return k;
9414}
9415
9416static Py_ssize_t
9417do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9418{
9419 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9420}
9421
9422static Py_ssize_t
9423do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9424{
9425 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9426}
9427
Benjamin Petersone51757f2012-01-12 21:10:29 -05009428static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009429do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9430{
9431 Py_ssize_t i, k = 0;
9432
9433 for (i = 0; i < length; i++) {
9434 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9435 Py_UCS4 mapped[3];
9436 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9437 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009438 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009439 res[k++] = mapped[j];
9440 }
9441 }
9442 return k;
9443}
9444
9445static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009446do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9447{
9448 Py_ssize_t i, k = 0;
9449 int previous_is_cased;
9450
9451 previous_is_cased = 0;
9452 for (i = 0; i < length; i++) {
9453 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9454 Py_UCS4 mapped[3];
9455 int n_res, j;
9456
9457 if (previous_is_cased)
9458 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9459 else
9460 n_res = _PyUnicode_ToTitleFull(c, mapped);
9461
9462 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009463 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009464 res[k++] = mapped[j];
9465 }
9466
9467 previous_is_cased = _PyUnicode_IsCased(c);
9468 }
9469 return k;
9470}
9471
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009472static PyObject *
9473case_operation(PyObject *self,
9474 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9475{
9476 PyObject *res = NULL;
9477 Py_ssize_t length, newlength = 0;
9478 int kind, outkind;
9479 void *data, *outdata;
9480 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9481
Benjamin Petersoneea48462012-01-16 14:28:50 -05009482 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009483
9484 kind = PyUnicode_KIND(self);
9485 data = PyUnicode_DATA(self);
9486 length = PyUnicode_GET_LENGTH(self);
Antoine Pitroub6dc9b72014-10-15 23:14:53 +02009487 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009488 PyErr_SetString(PyExc_OverflowError, "string is too long");
9489 return NULL;
9490 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009491 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009492 if (tmp == NULL)
9493 return PyErr_NoMemory();
9494 newlength = perform(kind, data, length, tmp, &maxchar);
9495 res = PyUnicode_New(newlength, maxchar);
9496 if (res == NULL)
9497 goto leave;
9498 tmpend = tmp + newlength;
9499 outdata = PyUnicode_DATA(res);
9500 outkind = PyUnicode_KIND(res);
9501 switch (outkind) {
9502 case PyUnicode_1BYTE_KIND:
9503 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9504 break;
9505 case PyUnicode_2BYTE_KIND:
9506 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9507 break;
9508 case PyUnicode_4BYTE_KIND:
9509 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9510 break;
9511 default:
9512 assert(0);
9513 break;
9514 }
9515 leave:
9516 PyMem_FREE(tmp);
9517 return res;
9518}
9519
Tim Peters8ce9f162004-08-27 01:49:32 +00009520PyObject *
9521PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009524 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009525 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009526 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009527 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9528 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009529 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009531 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009533 int use_memcpy;
9534 unsigned char *res_data = NULL, *sep_data = NULL;
9535 PyObject *last_obj;
9536 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009538 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009539 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009540 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009541 }
9542
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009543 /* NOTE: the following code can't call back into Python code,
9544 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009545 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009546
Tim Peters05eba1f2004-08-27 21:32:02 +00009547 seqlen = PySequence_Fast_GET_SIZE(fseq);
9548 /* If empty sequence, return u"". */
9549 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009550 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009551 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009552 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009553
Tim Peters05eba1f2004-08-27 21:32:02 +00009554 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009555 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009556 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009557 if (seqlen == 1) {
9558 if (PyUnicode_CheckExact(items[0])) {
9559 res = items[0];
9560 Py_INCREF(res);
9561 Py_DECREF(fseq);
9562 return res;
9563 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009564 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009565 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009566 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009567 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009568 /* Set up sep and seplen */
9569 if (separator == NULL) {
9570 /* fall back to a blank space separator */
9571 sep = PyUnicode_FromOrdinal(' ');
9572 if (!sep)
9573 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009574 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009575 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009576 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009577 else {
9578 if (!PyUnicode_Check(separator)) {
9579 PyErr_Format(PyExc_TypeError,
9580 "separator: expected str instance,"
9581 " %.80s found",
9582 Py_TYPE(separator)->tp_name);
9583 goto onError;
9584 }
9585 if (PyUnicode_READY(separator))
9586 goto onError;
9587 sep = separator;
9588 seplen = PyUnicode_GET_LENGTH(separator);
9589 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9590 /* inc refcount to keep this code path symmetric with the
9591 above case of a blank separator */
9592 Py_INCREF(sep);
9593 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009594 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009595 }
9596
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009597 /* There are at least two things to join, or else we have a subclass
9598 * of str in the sequence.
9599 * Do a pre-pass to figure out the total amount of space we'll
9600 * need (sz), and see whether all argument are strings.
9601 */
9602 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009603#ifdef Py_DEBUG
9604 use_memcpy = 0;
9605#else
9606 use_memcpy = 1;
9607#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009608 for (i = 0; i < seqlen; i++) {
9609 const Py_ssize_t old_sz = sz;
9610 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009611 if (!PyUnicode_Check(item)) {
9612 PyErr_Format(PyExc_TypeError,
9613 "sequence item %zd: expected str instance,"
9614 " %.80s found",
9615 i, Py_TYPE(item)->tp_name);
9616 goto onError;
9617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 if (PyUnicode_READY(item) == -1)
9619 goto onError;
9620 sz += PyUnicode_GET_LENGTH(item);
9621 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009622 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009623 if (i != 0)
9624 sz += seplen;
9625 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9626 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009627 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009628 goto onError;
9629 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009630 if (use_memcpy && last_obj != NULL) {
9631 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9632 use_memcpy = 0;
9633 }
9634 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009635 }
Tim Petersced69f82003-09-16 20:30:58 +00009636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009638 if (res == NULL)
9639 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009640
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009641 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009642#ifdef Py_DEBUG
9643 use_memcpy = 0;
9644#else
9645 if (use_memcpy) {
9646 res_data = PyUnicode_1BYTE_DATA(res);
9647 kind = PyUnicode_KIND(res);
9648 if (seplen != 0)
9649 sep_data = PyUnicode_1BYTE_DATA(sep);
9650 }
9651#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009653 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009654 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009655 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009656 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009657 if (use_memcpy) {
9658 Py_MEMCPY(res_data,
9659 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009660 kind * seplen);
9661 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009662 }
9663 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009664 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009665 res_offset += seplen;
9666 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009667 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009668 itemlen = PyUnicode_GET_LENGTH(item);
9669 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009670 if (use_memcpy) {
9671 Py_MEMCPY(res_data,
9672 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009673 kind * itemlen);
9674 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009675 }
9676 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009677 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009678 res_offset += itemlen;
9679 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009680 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009681 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009682 if (use_memcpy)
9683 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009684 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009685 else
9686 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009687
Tim Peters05eba1f2004-08-27 21:32:02 +00009688 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009690 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692
Benjamin Peterson29060642009-01-31 22:14:21 +00009693 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009694 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009696 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697 return NULL;
9698}
9699
/* Store value into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the character
   width (1, 2 or 4 byte kind); the wchar kind is not supported.

   The macro expands its arguments more than once (value is evaluated once
   per written character for the 2 and 4 byte kinds), so pass expressions
   without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)(data) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)(data) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)(data) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9723
Victor Stinnerd3f08822012-05-29 12:57:52 +02009724void
9725_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9726 Py_UCS4 fill_char)
9727{
9728 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9729 const void *data = PyUnicode_DATA(unicode);
9730 assert(PyUnicode_IS_READY(unicode));
9731 assert(unicode_modifiable(unicode));
9732 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9733 assert(start >= 0);
9734 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9735 FILL(kind, data, fill_char, start, length);
9736}
9737
Victor Stinner3fe55312012-01-04 00:33:50 +01009738Py_ssize_t
9739PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9740 Py_UCS4 fill_char)
9741{
9742 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009743
9744 if (!PyUnicode_Check(unicode)) {
9745 PyErr_BadInternalCall();
9746 return -1;
9747 }
9748 if (PyUnicode_READY(unicode) == -1)
9749 return -1;
9750 if (unicode_check_modifiable(unicode))
9751 return -1;
9752
Victor Stinnerd3f08822012-05-29 12:57:52 +02009753 if (start < 0) {
9754 PyErr_SetString(PyExc_IndexError, "string index out of range");
9755 return -1;
9756 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009757 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9758 PyErr_SetString(PyExc_ValueError,
9759 "fill character is bigger than "
9760 "the string maximum character");
9761 return -1;
9762 }
9763
9764 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9765 length = Py_MIN(maxlen, length);
9766 if (length <= 0)
9767 return 0;
9768
Victor Stinnerd3f08822012-05-29 12:57:52 +02009769 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009770 return length;
9771}
9772
Victor Stinner9310abb2011-10-05 00:59:23 +02009773static PyObject *
9774pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009775 Py_ssize_t left,
9776 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 PyObject *u;
9780 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009781 int kind;
9782 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009783
9784 if (left < 0)
9785 left = 0;
9786 if (right < 0)
9787 right = 0;
9788
Victor Stinnerc4b49542011-12-11 22:44:26 +01009789 if (left == 0 && right == 0)
9790 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9793 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009794 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9795 return NULL;
9796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009798 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009800 if (!u)
9801 return NULL;
9802
9803 kind = PyUnicode_KIND(u);
9804 data = PyUnicode_DATA(u);
9805 if (left)
9806 FILL(kind, data, fill, 0, left);
9807 if (right)
9808 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009809 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009810 assert(_PyUnicode_CheckConsistency(u, 1));
9811 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812}
9813
Alexander Belopolsky40018472011-02-26 01:02:56 +00009814PyObject *
9815PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818
9819 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009820 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009821 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009822 if (PyUnicode_READY(string) == -1) {
9823 Py_DECREF(string);
9824 return NULL;
9825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826
Benjamin Petersonead6b532011-12-20 17:23:42 -06009827 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009829 if (PyUnicode_IS_ASCII(string))
9830 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009831 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009832 PyUnicode_GET_LENGTH(string), keepends);
9833 else
9834 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009835 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009836 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 break;
9838 case PyUnicode_2BYTE_KIND:
9839 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009840 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 PyUnicode_GET_LENGTH(string), keepends);
9842 break;
9843 case PyUnicode_4BYTE_KIND:
9844 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009845 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 PyUnicode_GET_LENGTH(string), keepends);
9847 break;
9848 default:
9849 assert(0);
9850 list = 0;
9851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852 Py_DECREF(string);
9853 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854}
9855
Alexander Belopolsky40018472011-02-26 01:02:56 +00009856static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009857split(PyObject *self,
9858 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009859 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 int kind1, kind2, kind;
9862 void *buf1, *buf2;
9863 Py_ssize_t len1, len2;
9864 PyObject* out;
9865
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009867 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 if (PyUnicode_READY(self) == -1)
9870 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009873 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009874 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009875 if (PyUnicode_IS_ASCII(self))
9876 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009877 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009878 PyUnicode_GET_LENGTH(self), maxcount
9879 );
9880 else
9881 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009882 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009883 PyUnicode_GET_LENGTH(self), maxcount
9884 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885 case PyUnicode_2BYTE_KIND:
9886 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009887 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009888 PyUnicode_GET_LENGTH(self), maxcount
9889 );
9890 case PyUnicode_4BYTE_KIND:
9891 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009892 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 PyUnicode_GET_LENGTH(self), maxcount
9894 );
9895 default:
9896 assert(0);
9897 return NULL;
9898 }
9899
9900 if (PyUnicode_READY(substring) == -1)
9901 return NULL;
9902
9903 kind1 = PyUnicode_KIND(self);
9904 kind2 = PyUnicode_KIND(substring);
9905 kind = kind1 > kind2 ? kind1 : kind2;
9906 buf1 = PyUnicode_DATA(self);
9907 buf2 = PyUnicode_DATA(substring);
9908 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009909 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 if (!buf1)
9911 return NULL;
9912 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009913 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 if (!buf2) {
9915 if (kind1 != kind) PyMem_Free(buf1);
9916 return NULL;
9917 }
9918 len1 = PyUnicode_GET_LENGTH(self);
9919 len2 = PyUnicode_GET_LENGTH(substring);
9920
Benjamin Petersonead6b532011-12-20 17:23:42 -06009921 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009923 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9924 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009925 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009926 else
9927 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009928 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 break;
9930 case PyUnicode_2BYTE_KIND:
9931 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009932 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 break;
9934 case PyUnicode_4BYTE_KIND:
9935 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009936 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 break;
9938 default:
9939 out = NULL;
9940 }
9941 if (kind1 != kind)
9942 PyMem_Free(buf1);
9943 if (kind2 != kind)
9944 PyMem_Free(buf2);
9945 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946}
9947
Alexander Belopolsky40018472011-02-26 01:02:56 +00009948static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009949rsplit(PyObject *self,
9950 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009951 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 int kind1, kind2, kind;
9954 void *buf1, *buf2;
9955 Py_ssize_t len1, len2;
9956 PyObject* out;
9957
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009958 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009959 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 if (PyUnicode_READY(self) == -1)
9962 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009965 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009967 if (PyUnicode_IS_ASCII(self))
9968 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009969 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009970 PyUnicode_GET_LENGTH(self), maxcount
9971 );
9972 else
9973 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009974 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009975 PyUnicode_GET_LENGTH(self), maxcount
9976 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 case PyUnicode_2BYTE_KIND:
9978 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009979 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 PyUnicode_GET_LENGTH(self), maxcount
9981 );
9982 case PyUnicode_4BYTE_KIND:
9983 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009984 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 PyUnicode_GET_LENGTH(self), maxcount
9986 );
9987 default:
9988 assert(0);
9989 return NULL;
9990 }
9991
9992 if (PyUnicode_READY(substring) == -1)
9993 return NULL;
9994
9995 kind1 = PyUnicode_KIND(self);
9996 kind2 = PyUnicode_KIND(substring);
9997 kind = kind1 > kind2 ? kind1 : kind2;
9998 buf1 = PyUnicode_DATA(self);
9999 buf2 = PyUnicode_DATA(substring);
10000 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010001 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 if (!buf1)
10003 return NULL;
10004 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010005 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 if (!buf2) {
10007 if (kind1 != kind) PyMem_Free(buf1);
10008 return NULL;
10009 }
10010 len1 = PyUnicode_GET_LENGTH(self);
10011 len2 = PyUnicode_GET_LENGTH(substring);
10012
Benjamin Petersonead6b532011-12-20 17:23:42 -060010013 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010015 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10016 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010017 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010018 else
10019 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010020 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 break;
10022 case PyUnicode_2BYTE_KIND:
10023 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010024 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 break;
10026 case PyUnicode_4BYTE_KIND:
10027 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010028 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 break;
10030 default:
10031 out = NULL;
10032 }
10033 if (kind1 != kind)
10034 PyMem_Free(buf1);
10035 if (kind2 != kind)
10036 PyMem_Free(buf2);
10037 return out;
10038}
10039
10040static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010041anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10042 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010044 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010046 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10047 return asciilib_find(buf1, len1, buf2, len2, offset);
10048 else
10049 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 case PyUnicode_2BYTE_KIND:
10051 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10052 case PyUnicode_4BYTE_KIND:
10053 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10054 }
10055 assert(0);
10056 return -1;
10057}
10058
10059static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010060anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10061 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010063 switch (kind) {
10064 case PyUnicode_1BYTE_KIND:
10065 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10066 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10067 else
10068 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10069 case PyUnicode_2BYTE_KIND:
10070 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10071 case PyUnicode_4BYTE_KIND:
10072 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10073 }
10074 assert(0);
10075 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010076}
10077
Alexander Belopolsky40018472011-02-26 01:02:56 +000010078static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079replace(PyObject *self, PyObject *str1,
10080 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 PyObject *u;
10083 char *sbuf = PyUnicode_DATA(self);
10084 char *buf1 = PyUnicode_DATA(str1);
10085 char *buf2 = PyUnicode_DATA(str2);
10086 int srelease = 0, release1 = 0, release2 = 0;
10087 int skind = PyUnicode_KIND(self);
10088 int kind1 = PyUnicode_KIND(str1);
10089 int kind2 = PyUnicode_KIND(str2);
10090 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10091 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10092 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010093 int mayshrink;
10094 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095
10096 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010097 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010099 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100
Victor Stinner59de0ee2011-10-07 10:01:28 +020010101 if (str1 == str2)
10102 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 if (skind < kind1)
10104 /* substring too wide to be present */
10105 goto nothing;
10106
Victor Stinner49a0a212011-10-12 23:46:10 +020010107 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10108 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10109 /* Replacing str1 with str2 may cause a maxchar reduction in the
10110 result string. */
10111 mayshrink = (maxchar_str2 < maxchar);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010112 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010115 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010117 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010119 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010120 Py_UCS4 u1, u2;
10121 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010122 Py_ssize_t index, pos;
10123 char *src;
10124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010126 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10127 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010128 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010131 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010133 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010135
10136 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10137 index = 0;
10138 src = sbuf;
10139 while (--maxcount)
10140 {
10141 pos++;
10142 src += pos * PyUnicode_KIND(self);
10143 slen -= pos;
10144 index += pos;
10145 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10146 if (pos < 0)
10147 break;
10148 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10149 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010150 }
10151 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 int rkind = skind;
10153 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010154 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 if (kind1 < rkind) {
10157 /* widen substring */
10158 buf1 = _PyUnicode_AsKind(str1, rkind);
10159 if (!buf1) goto error;
10160 release1 = 1;
10161 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010162 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010163 if (i < 0)
10164 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 if (rkind > kind2) {
10166 /* widen replacement */
10167 buf2 = _PyUnicode_AsKind(str2, rkind);
10168 if (!buf2) goto error;
10169 release2 = 1;
10170 }
10171 else if (rkind < kind2) {
10172 /* widen self and buf1 */
10173 rkind = kind2;
10174 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010175 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 sbuf = _PyUnicode_AsKind(self, rkind);
10177 if (!sbuf) goto error;
10178 srelease = 1;
10179 buf1 = _PyUnicode_AsKind(str1, rkind);
10180 if (!buf1) goto error;
10181 release1 = 1;
10182 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010183 u = PyUnicode_New(slen, maxchar);
10184 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010186 assert(PyUnicode_KIND(u) == rkind);
10187 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010188
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010189 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010190 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010191 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010193 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010195
10196 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010197 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010198 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010199 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010200 if (i == -1)
10201 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010202 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010204 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010208 }
10209 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010211 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 int rkind = skind;
10213 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010216 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 buf1 = _PyUnicode_AsKind(str1, rkind);
10218 if (!buf1) goto error;
10219 release1 = 1;
10220 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010221 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010222 if (n == 0)
10223 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010225 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 buf2 = _PyUnicode_AsKind(str2, rkind);
10227 if (!buf2) goto error;
10228 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010231 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 rkind = kind2;
10233 sbuf = _PyUnicode_AsKind(self, rkind);
10234 if (!sbuf) goto error;
10235 srelease = 1;
10236 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010237 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 buf1 = _PyUnicode_AsKind(str1, rkind);
10239 if (!buf1) goto error;
10240 release1 = 1;
10241 }
10242 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10243 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010244 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 PyErr_SetString(PyExc_OverflowError,
10246 "replace string is too long");
10247 goto error;
10248 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010249 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010250 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010251 _Py_INCREF_UNICODE_EMPTY();
10252 if (!unicode_empty)
10253 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010254 u = unicode_empty;
10255 goto done;
10256 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010257 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 PyErr_SetString(PyExc_OverflowError,
10259 "replace string is too long");
10260 goto error;
10261 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010262 u = PyUnicode_New(new_size, maxchar);
10263 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010265 assert(PyUnicode_KIND(u) == rkind);
10266 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 ires = i = 0;
10268 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010269 while (n-- > 0) {
10270 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010271 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010272 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010273 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010274 if (j == -1)
10275 break;
10276 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010277 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010278 memcpy(res + rkind * ires,
10279 sbuf + rkind * i,
10280 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010282 }
10283 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010285 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010287 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010289 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010293 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010294 memcpy(res + rkind * ires,
10295 sbuf + rkind * i,
10296 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010297 }
10298 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010299 /* interleave */
10300 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010301 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010303 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010305 if (--n <= 0)
10306 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010307 memcpy(res + rkind * ires,
10308 sbuf + rkind * i,
10309 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 ires++;
10311 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010312 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010313 memcpy(res + rkind * ires,
10314 sbuf + rkind * i,
10315 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010316 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010317 }
10318
10319 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010320 unicode_adjust_maxchar(&u);
10321 if (u == NULL)
10322 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010323 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010324
10325 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 if (srelease)
10327 PyMem_FREE(sbuf);
10328 if (release1)
10329 PyMem_FREE(buf1);
10330 if (release2)
10331 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010332 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010334
Benjamin Peterson29060642009-01-31 22:14:21 +000010335 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010336 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 if (srelease)
10338 PyMem_FREE(sbuf);
10339 if (release1)
10340 PyMem_FREE(buf1);
10341 if (release2)
10342 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010343 return unicode_result_unchanged(self);
10344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 error:
10346 if (srelease && sbuf)
10347 PyMem_FREE(sbuf);
10348 if (release1 && buf1)
10349 PyMem_FREE(buf1);
10350 if (release2 && buf2)
10351 PyMem_FREE(buf2);
10352 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010353}
10354
10355/* --- Unicode Object Methods --------------------------------------------- */
10356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010357PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010358 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359\n\
10360Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010361characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362
10363static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010364unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010366 if (PyUnicode_READY(self) == -1)
10367 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010368 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369}
10370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010371PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010372 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373\n\
10374Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010375have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376
10377static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010378unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010380 if (PyUnicode_READY(self) == -1)
10381 return NULL;
10382 if (PyUnicode_GET_LENGTH(self) == 0)
10383 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010384 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385}
10386
Benjamin Petersond5890c82012-01-14 13:23:30 -050010387PyDoc_STRVAR(casefold__doc__,
10388 "S.casefold() -> str\n\
10389\n\
10390Return a version of S suitable for caseless comparisons.");
10391
10392static PyObject *
10393unicode_casefold(PyObject *self)
10394{
10395 if (PyUnicode_READY(self) == -1)
10396 return NULL;
10397 if (PyUnicode_IS_ASCII(self))
10398 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010399 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010400}
10401
10402
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010403/* Argument converter. Coerces to a single unicode character */
10404
10405static int
10406convert_uc(PyObject *obj, void *addr)
10407{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010409 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010410
Benjamin Peterson14339b62009-01-31 16:36:08 +000010411 uniobj = PyUnicode_FromObject(obj);
10412 if (uniobj == NULL) {
10413 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010414 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010415 return 0;
10416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010418 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010419 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010420 Py_DECREF(uniobj);
10421 return 0;
10422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010424 Py_DECREF(uniobj);
10425 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010426}
10427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010428PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010429 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010431Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010432done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010433
10434static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010435unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010437 Py_ssize_t marg, left;
10438 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 Py_UCS4 fillchar = ' ';
10440
Victor Stinnere9a29352011-10-01 02:14:59 +020010441 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443
Benjamin Petersonbac79492012-01-14 13:34:47 -050010444 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445 return NULL;
10446
Victor Stinnerc4b49542011-12-11 22:44:26 +010010447 if (PyUnicode_GET_LENGTH(self) >= width)
10448 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010449
Victor Stinnerc4b49542011-12-11 22:44:26 +010010450 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010451 left = marg / 2 + (marg & width & 1);
10452
Victor Stinner9310abb2011-10-05 00:59:23 +020010453 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010454}
10455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456/* This function assumes that str1 and str2 are readied by the caller. */
10457
Marc-André Lemburge5034372000-08-08 08:04:29 +000010458static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010459unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010460{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 int kind1, kind2;
10462 void *data1, *data2;
10463 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 kind1 = PyUnicode_KIND(str1);
10466 kind2 = PyUnicode_KIND(str2);
10467 data1 = PyUnicode_DATA(str1);
10468 data2 = PyUnicode_DATA(str2);
10469 len1 = PyUnicode_GET_LENGTH(str1);
10470 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 for (i = 0; i < len1 && i < len2; ++i) {
10473 Py_UCS4 c1, c2;
10474 c1 = PyUnicode_READ(kind1, data1, i);
10475 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010476
10477 if (c1 != c2)
10478 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010479 }
10480
10481 return (len1 < len2) ? -1 : (len1 != len2);
10482}
10483
Alexander Belopolsky40018472011-02-26 01:02:56 +000010484int
10485PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10488 if (PyUnicode_READY(left) == -1 ||
10489 PyUnicode_READY(right) == -1)
10490 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010491 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010493 PyErr_Format(PyExc_TypeError,
10494 "Can't compare %.100s and %.100s",
10495 left->ob_type->tp_name,
10496 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 return -1;
10498}
10499
Martin v. Löwis5b222132007-06-10 09:51:05 +000010500int
10501PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10502{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 Py_ssize_t i;
10504 int kind;
10505 void *data;
10506 Py_UCS4 chr;
10507
Victor Stinner910337b2011-10-03 03:20:16 +020010508 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 if (PyUnicode_READY(uni) == -1)
10510 return -1;
10511 kind = PyUnicode_KIND(uni);
10512 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010513 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10515 if (chr != str[i])
10516 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010517 /* This check keeps Python strings that end in '\0' from comparing equal
10518 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010520 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010521 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010522 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010523 return 0;
10524}
10525
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010526
Benjamin Peterson29060642009-01-31 22:14:21 +000010527#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010528 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010529
Alexander Belopolsky40018472011-02-26 01:02:56 +000010530PyObject *
10531PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010532{
10533 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010534
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010535 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10536 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 if (PyUnicode_READY(left) == -1 ||
10538 PyUnicode_READY(right) == -1)
10539 return NULL;
10540 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10541 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010542 if (op == Py_EQ) {
10543 Py_INCREF(Py_False);
10544 return Py_False;
10545 }
10546 if (op == Py_NE) {
10547 Py_INCREF(Py_True);
10548 return Py_True;
10549 }
10550 }
10551 if (left == right)
10552 result = 0;
10553 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010554 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010555
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010556 /* Convert the return value to a Boolean */
10557 switch (op) {
10558 case Py_EQ:
10559 v = TEST_COND(result == 0);
10560 break;
10561 case Py_NE:
10562 v = TEST_COND(result != 0);
10563 break;
10564 case Py_LE:
10565 v = TEST_COND(result <= 0);
10566 break;
10567 case Py_GE:
10568 v = TEST_COND(result >= 0);
10569 break;
10570 case Py_LT:
10571 v = TEST_COND(result == -1);
10572 break;
10573 case Py_GT:
10574 v = TEST_COND(result == 1);
10575 break;
10576 default:
10577 PyErr_BadArgument();
10578 return NULL;
10579 }
10580 Py_INCREF(v);
10581 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010582 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010583
Brian Curtindfc80e32011-08-10 20:28:54 -050010584 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010585}
10586
Alexander Belopolsky40018472011-02-26 01:02:56 +000010587int
10588PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010589{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010590 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 int kind1, kind2, kind;
10592 void *buf1, *buf2;
10593 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010594 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010595
10596 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010597 sub = PyUnicode_FromObject(element);
10598 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010599 PyErr_Format(PyExc_TypeError,
10600 "'in <string>' requires string as left operand, not %s",
10601 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010602 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010603 }
10604
Thomas Wouters477c8d52006-05-27 19:21:47 +000010605 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010606 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010607 Py_DECREF(sub);
10608 return -1;
10609 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010610 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10611 Py_DECREF(sub);
10612 Py_DECREF(str);
10613 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 kind1 = PyUnicode_KIND(str);
10616 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010617 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 buf1 = PyUnicode_DATA(str);
10619 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010620 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010621 if (kind2 > kind) {
10622 Py_DECREF(sub);
10623 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010624 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010625 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010626 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 if (!buf2) {
10629 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010630 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 return -1;
10632 }
10633 len1 = PyUnicode_GET_LENGTH(str);
10634 len2 = PyUnicode_GET_LENGTH(sub);
10635
Benjamin Petersonead6b532011-12-20 17:23:42 -060010636 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 case PyUnicode_1BYTE_KIND:
10638 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10639 break;
10640 case PyUnicode_2BYTE_KIND:
10641 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10642 break;
10643 case PyUnicode_4BYTE_KIND:
10644 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10645 break;
10646 default:
10647 result = -1;
10648 assert(0);
10649 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650
10651 Py_DECREF(str);
10652 Py_DECREF(sub);
10653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 if (kind2 != kind)
10655 PyMem_Free(buf2);
10656
Guido van Rossum403d68b2000-03-13 15:55:09 +000010657 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010658}
10659
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660/* Concat to string or Unicode object giving a new Unicode object. */
10661
Alexander Belopolsky40018472011-02-26 01:02:56 +000010662PyObject *
10663PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010666 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010667 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668
10669 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010672 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010675 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676
10677 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010678 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010679 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010682 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010683 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010685 }
10686
Victor Stinner488fa492011-12-12 00:01:39 +010010687 u_len = PyUnicode_GET_LENGTH(u);
10688 v_len = PyUnicode_GET_LENGTH(v);
10689 if (u_len > PY_SSIZE_T_MAX - v_len) {
10690 PyErr_SetString(PyExc_OverflowError,
10691 "strings are too large to concat");
10692 goto onError;
10693 }
10694 new_len = u_len + v_len;
10695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010697 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010698 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010701 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010703 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010704 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10705 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706 Py_DECREF(u);
10707 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010708 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710
Benjamin Peterson29060642009-01-31 22:14:21 +000010711 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712 Py_XDECREF(u);
10713 Py_XDECREF(v);
10714 return NULL;
10715}
10716
Walter Dörwald1ab83302007-05-18 17:15:44 +000010717void
Victor Stinner23e56682011-10-03 03:54:37 +020010718PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010719{
Victor Stinner23e56682011-10-03 03:54:37 +020010720 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010721 Py_UCS4 maxchar, maxchar2;
10722 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010723
10724 if (p_left == NULL) {
10725 if (!PyErr_Occurred())
10726 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010727 return;
10728 }
Victor Stinner23e56682011-10-03 03:54:37 +020010729 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010730 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010731 if (!PyErr_Occurred())
10732 PyErr_BadInternalCall();
10733 goto error;
10734 }
10735
Benjamin Petersonbac79492012-01-14 13:34:47 -050010736 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010737 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010738 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010739 goto error;
10740
Victor Stinner488fa492011-12-12 00:01:39 +010010741 /* Shortcuts */
10742 if (left == unicode_empty) {
10743 Py_DECREF(left);
10744 Py_INCREF(right);
10745 *p_left = right;
10746 return;
10747 }
10748 if (right == unicode_empty)
10749 return;
10750
10751 left_len = PyUnicode_GET_LENGTH(left);
10752 right_len = PyUnicode_GET_LENGTH(right);
10753 if (left_len > PY_SSIZE_T_MAX - right_len) {
10754 PyErr_SetString(PyExc_OverflowError,
10755 "strings are too large to concat");
10756 goto error;
10757 }
10758 new_len = left_len + right_len;
10759
10760 if (unicode_modifiable(left)
10761 && PyUnicode_CheckExact(right)
10762 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010763 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10764 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010765 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010766 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010767 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10768 {
10769 /* append inplace */
10770 if (unicode_resize(p_left, new_len) != 0) {
10771 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10772 * deallocated so it cannot be put back into
10773 * 'variable'. The MemoryError is raised when there
10774 * is no value in 'variable', which might (very
10775 * remotely) be a cause of incompatibilities.
10776 */
10777 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010778 }
Victor Stinner488fa492011-12-12 00:01:39 +010010779 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010780 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010781 }
Victor Stinner488fa492011-12-12 00:01:39 +010010782 else {
10783 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10784 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010785 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010786
Victor Stinner488fa492011-12-12 00:01:39 +010010787 /* Concat the two Unicode strings */
10788 res = PyUnicode_New(new_len, maxchar);
10789 if (res == NULL)
10790 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010791 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10792 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010793 Py_DECREF(left);
10794 *p_left = res;
10795 }
10796 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010797 return;
10798
10799error:
Victor Stinner488fa492011-12-12 00:01:39 +010010800 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010801}
10802
10803void
10804PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10805{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010806 PyUnicode_Append(pleft, right);
10807 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010808}
10809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010810PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010811 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010813Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010814string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010815interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816
10817static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010818unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010820 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010821 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010822 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 int kind1, kind2, kind;
10825 void *buf1, *buf2;
10826 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827
Jesus Ceaac451502011-04-20 17:09:23 +020010828 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10829 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010830 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 kind1 = PyUnicode_KIND(self);
10833 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010834 if (kind2 > kind1)
10835 return PyLong_FromLong(0);
10836 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837 buf1 = PyUnicode_DATA(self);
10838 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010840 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (!buf2) {
10842 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 return NULL;
10844 }
10845 len1 = PyUnicode_GET_LENGTH(self);
10846 len2 = PyUnicode_GET_LENGTH(substring);
10847
10848 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010849 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 case PyUnicode_1BYTE_KIND:
10851 iresult = ucs1lib_count(
10852 ((Py_UCS1*)buf1) + start, end - start,
10853 buf2, len2, PY_SSIZE_T_MAX
10854 );
10855 break;
10856 case PyUnicode_2BYTE_KIND:
10857 iresult = ucs2lib_count(
10858 ((Py_UCS2*)buf1) + start, end - start,
10859 buf2, len2, PY_SSIZE_T_MAX
10860 );
10861 break;
10862 case PyUnicode_4BYTE_KIND:
10863 iresult = ucs4lib_count(
10864 ((Py_UCS4*)buf1) + start, end - start,
10865 buf2, len2, PY_SSIZE_T_MAX
10866 );
10867 break;
10868 default:
10869 assert(0); iresult = 0;
10870 }
10871
10872 result = PyLong_FromSsize_t(iresult);
10873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 if (kind2 != kind)
10875 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876
10877 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010878
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879 return result;
10880}
10881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010882PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010883 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010884\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010885Encode S using the codec registered for encoding. Default encoding\n\
10886is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010887handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010888a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10889'xmlcharrefreplace' as well as any other name registered with\n\
10890codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891
10892static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010893unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010895 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896 char *encoding = NULL;
10897 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010898
Benjamin Peterson308d6372009-09-18 21:42:35 +000010899 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10900 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010902 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010903}
10904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010905PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010906 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907\n\
10908Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010909If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910
10911static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010912unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010914 Py_ssize_t i, j, line_pos, src_len, incr;
10915 Py_UCS4 ch;
10916 PyObject *u;
10917 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010919 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010920 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010921
10922 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924
Antoine Pitrou22425222011-10-04 19:10:51 +020010925 if (PyUnicode_READY(self) == -1)
10926 return NULL;
10927
Thomas Wouters7e474022000-07-16 12:04:32 +000010928 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010929 src_len = PyUnicode_GET_LENGTH(self);
10930 i = j = line_pos = 0;
10931 kind = PyUnicode_KIND(self);
10932 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010933 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010934 for (; i < src_len; i++) {
10935 ch = PyUnicode_READ(kind, src_data, i);
10936 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010937 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010938 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010939 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010940 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010941 goto overflow;
10942 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010943 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010944 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010945 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010947 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010948 goto overflow;
10949 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010951 if (ch == '\n' || ch == '\r')
10952 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010954 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010955 if (!found)
10956 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010957
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010959 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960 if (!u)
10961 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010962 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963
Antoine Pitroue71d5742011-10-04 15:55:09 +020010964 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965
Antoine Pitroue71d5742011-10-04 15:55:09 +020010966 for (; i < src_len; i++) {
10967 ch = PyUnicode_READ(kind, src_data, i);
10968 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010969 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010970 incr = tabsize - (line_pos % tabsize);
10971 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010972 FILL(kind, dest_data, ' ', j, incr);
10973 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010974 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010975 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010977 line_pos++;
10978 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010979 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010980 if (ch == '\n' || ch == '\r')
10981 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010983 }
10984 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010985 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010986
Antoine Pitroue71d5742011-10-04 15:55:09 +020010987 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010988 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990}
10991
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010992PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010993 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994\n\
10995Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010996such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997arguments start and end are interpreted as in slice notation.\n\
10998\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010999Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000
11001static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011004 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011005 Py_ssize_t start;
11006 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011007 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008
Jesus Ceaac451502011-04-20 17:09:23 +020011009 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11010 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 if (PyUnicode_READY(self) == -1)
11014 return NULL;
11015 if (PyUnicode_READY(substring) == -1)
11016 return NULL;
11017
Victor Stinner7931d9a2011-11-04 00:22:48 +010011018 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019
11020 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 if (result == -2)
11023 return NULL;
11024
Christian Heimes217cfd12007-12-02 14:31:20 +000011025 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026}
11027
11028static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011029unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011031 void *data;
11032 enum PyUnicode_Kind kind;
11033 Py_UCS4 ch;
11034 PyObject *res;
11035
11036 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11037 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011039 }
11040 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11041 PyErr_SetString(PyExc_IndexError, "string index out of range");
11042 return NULL;
11043 }
11044 kind = PyUnicode_KIND(self);
11045 data = PyUnicode_DATA(self);
11046 ch = PyUnicode_READ(kind, data, index);
11047 if (ch < 256)
11048 return get_latin1_char(ch);
11049
11050 res = PyUnicode_New(1, ch);
11051 if (res == NULL)
11052 return NULL;
11053 kind = PyUnicode_KIND(res);
11054 data = PyUnicode_DATA(res);
11055 PyUnicode_WRITE(kind, data, 0, ch);
11056 assert(_PyUnicode_CheckConsistency(res, 1));
11057 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058}
11059
Guido van Rossumc2504932007-09-18 19:42:40 +000011060/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011061 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011062static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011063unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064{
Guido van Rossumc2504932007-09-18 19:42:40 +000011065 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011066 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011067
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011068#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011069 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011070#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 if (_PyUnicode_HASH(self) != -1)
11072 return _PyUnicode_HASH(self);
11073 if (PyUnicode_READY(self) == -1)
11074 return -1;
11075 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011076 /*
11077 We make the hash of the empty string be 0, rather than using
11078 (prefix ^ suffix), since this slightly obfuscates the hash secret
11079 */
11080 if (len == 0) {
11081 _PyUnicode_HASH(self) = 0;
11082 return 0;
11083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084
11085 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011086#define HASH(P) \
11087 x ^= (Py_uhash_t) *P << 7; \
11088 while (--len >= 0) \
11089 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090
Georg Brandl2fb477c2012-02-21 00:33:36 +010011091 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 switch (PyUnicode_KIND(self)) {
11093 case PyUnicode_1BYTE_KIND: {
11094 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11095 HASH(c);
11096 break;
11097 }
11098 case PyUnicode_2BYTE_KIND: {
11099 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11100 HASH(s);
11101 break;
11102 }
11103 default: {
11104 Py_UCS4 *l;
11105 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11106 "Impossible switch case in unicode_hash");
11107 l = PyUnicode_4BYTE_DATA(self);
11108 HASH(l);
11109 break;
11110 }
11111 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011112 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11113 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011114
Guido van Rossumc2504932007-09-18 19:42:40 +000011115 if (x == -1)
11116 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011117 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011118 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011120#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011122PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011123 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011125Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126
11127static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011128unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011130 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011131 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011132 Py_ssize_t start;
11133 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134
Jesus Ceaac451502011-04-20 17:09:23 +020011135 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11136 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 if (PyUnicode_READY(self) == -1)
11140 return NULL;
11141 if (PyUnicode_READY(substring) == -1)
11142 return NULL;
11143
Victor Stinner7931d9a2011-11-04 00:22:48 +010011144 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145
11146 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 if (result == -2)
11149 return NULL;
11150
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151 if (result < 0) {
11152 PyErr_SetString(PyExc_ValueError, "substring not found");
11153 return NULL;
11154 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011155
Christian Heimes217cfd12007-12-02 14:31:20 +000011156 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157}
11158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011159PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011160 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011162Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011163at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164
11165static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011166unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 Py_ssize_t i, length;
11169 int kind;
11170 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171 int cased;
11172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 if (PyUnicode_READY(self) == -1)
11174 return NULL;
11175 length = PyUnicode_GET_LENGTH(self);
11176 kind = PyUnicode_KIND(self);
11177 data = PyUnicode_DATA(self);
11178
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 if (length == 1)
11181 return PyBool_FromLong(
11182 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011184 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011186 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011187
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 for (i = 0; i < length; i++) {
11190 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011191
Benjamin Peterson29060642009-01-31 22:14:21 +000011192 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11193 return PyBool_FromLong(0);
11194 else if (!cased && Py_UNICODE_ISLOWER(ch))
11195 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011197 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198}
11199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011200PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011201 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011203Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011204at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205
11206static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011207unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 Py_ssize_t i, length;
11210 int kind;
11211 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212 int cased;
11213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 if (PyUnicode_READY(self) == -1)
11215 return NULL;
11216 length = PyUnicode_GET_LENGTH(self);
11217 kind = PyUnicode_KIND(self);
11218 data = PyUnicode_DATA(self);
11219
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221 if (length == 1)
11222 return PyBool_FromLong(
11223 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011225 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011227 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011228
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 for (i = 0; i < length; i++) {
11231 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011232
Benjamin Peterson29060642009-01-31 22:14:21 +000011233 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11234 return PyBool_FromLong(0);
11235 else if (!cased && Py_UNICODE_ISUPPER(ch))
11236 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011238 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239}
11240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011241PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011242 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011244Return True if S is a titlecased string and there is at least one\n\
11245character in S, i.e. upper- and titlecase characters may only\n\
11246follow uncased characters and lowercase characters only cased ones.\n\
11247Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
11249static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011250unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011252 Py_ssize_t i, length;
11253 int kind;
11254 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255 int cased, previous_is_cased;
11256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011257 if (PyUnicode_READY(self) == -1)
11258 return NULL;
11259 length = PyUnicode_GET_LENGTH(self);
11260 kind = PyUnicode_KIND(self);
11261 data = PyUnicode_DATA(self);
11262
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011264 if (length == 1) {
11265 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11266 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11267 (Py_UNICODE_ISUPPER(ch) != 0));
11268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011270 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011271 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011272 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011273
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274 cased = 0;
11275 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011276 for (i = 0; i < length; i++) {
11277 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011278
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11280 if (previous_is_cased)
11281 return PyBool_FromLong(0);
11282 previous_is_cased = 1;
11283 cased = 1;
11284 }
11285 else if (Py_UNICODE_ISLOWER(ch)) {
11286 if (!previous_is_cased)
11287 return PyBool_FromLong(0);
11288 previous_is_cased = 1;
11289 cased = 1;
11290 }
11291 else
11292 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011294 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295}
11296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011297PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011298 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011300Return True if all characters in S are whitespace\n\
11301and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302
11303static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011304unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 Py_ssize_t i, length;
11307 int kind;
11308 void *data;
11309
11310 if (PyUnicode_READY(self) == -1)
11311 return NULL;
11312 length = PyUnicode_GET_LENGTH(self);
11313 kind = PyUnicode_KIND(self);
11314 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 if (length == 1)
11318 return PyBool_FromLong(
11319 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011321 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011323 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 for (i = 0; i < length; i++) {
11326 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011327 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011328 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011330 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331}
11332
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011333PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011334 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011335\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011336Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011337and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011338
11339static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011340unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011341{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 Py_ssize_t i, length;
11343 int kind;
11344 void *data;
11345
11346 if (PyUnicode_READY(self) == -1)
11347 return NULL;
11348 length = PyUnicode_GET_LENGTH(self);
11349 kind = PyUnicode_KIND(self);
11350 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011351
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011352 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 if (length == 1)
11354 return PyBool_FromLong(
11355 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011356
11357 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011359 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 for (i = 0; i < length; i++) {
11362 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011363 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011364 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011365 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011366}
11367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011368PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011370\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011371Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011372and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011373
11374static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011375unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011376{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 int kind;
11378 void *data;
11379 Py_ssize_t len, i;
11380
11381 if (PyUnicode_READY(self) == -1)
11382 return NULL;
11383
11384 kind = PyUnicode_KIND(self);
11385 data = PyUnicode_DATA(self);
11386 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011387
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011388 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 if (len == 1) {
11390 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11391 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11392 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011393
11394 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011395 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011396 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 for (i = 0; i < len; i++) {
11399 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011400 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011401 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011402 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011403 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011404}
11405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011406PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011407 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011409Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411
11412static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011413unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 Py_ssize_t i, length;
11416 int kind;
11417 void *data;
11418
11419 if (PyUnicode_READY(self) == -1)
11420 return NULL;
11421 length = PyUnicode_GET_LENGTH(self);
11422 kind = PyUnicode_KIND(self);
11423 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 if (length == 1)
11427 return PyBool_FromLong(
11428 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011430 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011432 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 for (i = 0; i < length; i++) {
11435 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011438 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439}
11440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011441PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011442 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011444Return True if all characters in S are digits\n\
11445and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446
11447static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011448unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 Py_ssize_t i, length;
11451 int kind;
11452 void *data;
11453
11454 if (PyUnicode_READY(self) == -1)
11455 return NULL;
11456 length = PyUnicode_GET_LENGTH(self);
11457 kind = PyUnicode_KIND(self);
11458 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 if (length == 1) {
11462 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11463 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011466 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011468 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 for (i = 0; i < length; i++) {
11471 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011472 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011474 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475}
11476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011477PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011480Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011481False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482
11483static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011484unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 Py_ssize_t i, length;
11487 int kind;
11488 void *data;
11489
11490 if (PyUnicode_READY(self) == -1)
11491 return NULL;
11492 length = PyUnicode_GET_LENGTH(self);
11493 kind = PyUnicode_KIND(self);
11494 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 if (length == 1)
11498 return PyBool_FromLong(
11499 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011501 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 for (i = 0; i < length; i++) {
11506 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011509 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510}
11511
Martin v. Löwis47383402007-08-15 07:32:56 +000011512int
11513PyUnicode_IsIdentifier(PyObject *self)
11514{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 int kind;
11516 void *data;
11517 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011518 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 if (PyUnicode_READY(self) == -1) {
11521 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 }
11524
11525 /* Special case for empty strings */
11526 if (PyUnicode_GET_LENGTH(self) == 0)
11527 return 0;
11528 kind = PyUnicode_KIND(self);
11529 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011530
11531 /* PEP 3131 says that the first character must be in
11532 XID_Start and subsequent characters in XID_Continue,
11533 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011534 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011535 letters, digits, underscore). However, given the current
11536 definition of XID_Start and XID_Continue, it is sufficient
11537 to check just for these, except that _ must be allowed
11538 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011540 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011541 return 0;
11542
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011543 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011544 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011546 return 1;
11547}
11548
11549PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011551\n\
11552Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011553to the language definition.\n\
11554\n\
11555Use keyword.iskeyword() to test for reserved identifiers\n\
11556such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011557
11558static PyObject*
11559unicode_isidentifier(PyObject *self)
11560{
11561 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11562}
11563
Georg Brandl559e5d72008-06-11 18:37:52 +000011564PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011566\n\
11567Return True if all characters in S are considered\n\
11568printable in repr() or S is empty, False otherwise.");
11569
11570static PyObject*
11571unicode_isprintable(PyObject *self)
11572{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573 Py_ssize_t i, length;
11574 int kind;
11575 void *data;
11576
11577 if (PyUnicode_READY(self) == -1)
11578 return NULL;
11579 length = PyUnicode_GET_LENGTH(self);
11580 kind = PyUnicode_KIND(self);
11581 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011582
11583 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 if (length == 1)
11585 return PyBool_FromLong(
11586 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011588 for (i = 0; i < length; i++) {
11589 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011590 Py_RETURN_FALSE;
11591 }
11592 }
11593 Py_RETURN_TRUE;
11594}
11595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011596PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011597 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598\n\
11599Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011600iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601
11602static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011603unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011605 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606}
11607
Martin v. Löwis18e16552006-02-15 17:27:45 +000011608static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011609unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611 if (PyUnicode_READY(self) == -1)
11612 return -1;
11613 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614}
11615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011616PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011619Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011620done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621
11622static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011623unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011625 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 Py_UCS4 fillchar = ' ';
11627
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011628 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629 return NULL;
11630
Benjamin Petersonbac79492012-01-14 13:34:47 -050011631 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633
Victor Stinnerc4b49542011-12-11 22:44:26 +010011634 if (PyUnicode_GET_LENGTH(self) >= width)
11635 return unicode_result_unchanged(self);
11636
11637 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638}
11639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011640PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011641 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011643Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644
11645static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011646unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011648 if (PyUnicode_READY(self) == -1)
11649 return NULL;
11650 if (PyUnicode_IS_ASCII(self))
11651 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011652 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653}
11654
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip mode above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
11663
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011664/* externally visible for str.strip(unicode) */
11665PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011666_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 void *data;
11669 int kind;
11670 Py_ssize_t i, j, len;
11671 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11674 return NULL;
11675
11676 kind = PyUnicode_KIND(self);
11677 data = PyUnicode_DATA(self);
11678 len = PyUnicode_GET_LENGTH(self);
11679 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11680 PyUnicode_DATA(sepobj),
11681 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011682
Benjamin Peterson14339b62009-01-31 16:36:08 +000011683 i = 0;
11684 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 while (i < len &&
11686 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011687 i++;
11688 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011689 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011690
Benjamin Peterson14339b62009-01-31 16:36:08 +000011691 j = len;
11692 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011693 do {
11694 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 } while (j >= i &&
11696 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011698 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011699
Victor Stinner7931d9a2011-11-04 00:22:48 +010011700 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701}
11702
11703PyObject*
11704PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11705{
11706 unsigned char *data;
11707 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011708 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709
Victor Stinnerde636f32011-10-01 03:55:54 +020011710 if (PyUnicode_READY(self) == -1)
11711 return NULL;
11712
Victor Stinner684d5fd2012-05-03 02:32:34 +020011713 length = PyUnicode_GET_LENGTH(self);
11714 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011715
Victor Stinner684d5fd2012-05-03 02:32:34 +020011716 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011717 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718
Victor Stinnerde636f32011-10-01 03:55:54 +020011719 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011720 PyErr_SetString(PyExc_IndexError, "string index out of range");
11721 return NULL;
11722 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011723 if (start >= length || end < start)
11724 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011725
Victor Stinner684d5fd2012-05-03 02:32:34 +020011726 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011727 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011728 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011729 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011730 }
11731 else {
11732 kind = PyUnicode_KIND(self);
11733 data = PyUnicode_1BYTE_DATA(self);
11734 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011735 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011736 length);
11737 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739
11740static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011741do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743 int kind;
11744 void *data;
11745 Py_ssize_t len, i, j;
11746
11747 if (PyUnicode_READY(self) == -1)
11748 return NULL;
11749
11750 kind = PyUnicode_KIND(self);
11751 data = PyUnicode_DATA(self);
11752 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011753
Benjamin Peterson14339b62009-01-31 16:36:08 +000011754 i = 0;
11755 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011757 i++;
11758 }
11759 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011760
Benjamin Peterson14339b62009-01-31 16:36:08 +000011761 j = len;
11762 if (striptype != LEFTSTRIP) {
11763 do {
11764 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011766 j++;
11767 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011768
Victor Stinner7931d9a2011-11-04 00:22:48 +010011769 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770}
11771
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011772
11773static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011774do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011775{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011776 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011777
Benjamin Peterson14339b62009-01-31 16:36:08 +000011778 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11779 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011780
Benjamin Peterson14339b62009-01-31 16:36:08 +000011781 if (sep != NULL && sep != Py_None) {
11782 if (PyUnicode_Check(sep))
11783 return _PyUnicode_XStrip(self, striptype, sep);
11784 else {
11785 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 "%s arg must be None or str",
11787 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011788 return NULL;
11789 }
11790 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011791
Benjamin Peterson14339b62009-01-31 16:36:08 +000011792 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011793}
11794
11795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011796PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011797 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011798\n\
11799Return a copy of the string S with leading and trailing\n\
11800whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011801If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011802
11803static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011804unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011805{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011806 if (PyTuple_GET_SIZE(args) == 0)
11807 return do_strip(self, BOTHSTRIP); /* Common case */
11808 else
11809 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011810}
11811
11812
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011813PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011814 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011815\n\
11816Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011817If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011818
11819static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011820unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011821{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011822 if (PyTuple_GET_SIZE(args) == 0)
11823 return do_strip(self, LEFTSTRIP); /* Common case */
11824 else
11825 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011826}
11827
11828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011829PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011831\n\
11832Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011833If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011834
11835static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011836unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011837{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011838 if (PyTuple_GET_SIZE(args) == 0)
11839 return do_strip(self, RIGHTSTRIP); /* Common case */
11840 else
11841 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011842}
11843
11844
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011846unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011848 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850
Serhiy Storchaka05997252013-01-26 12:14:02 +020011851 if (len < 1)
11852 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853
Victor Stinnerc4b49542011-12-11 22:44:26 +010011854 /* no repeat, return original string */
11855 if (len == 1)
11856 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011857
Benjamin Petersonbac79492012-01-14 13:34:47 -050011858 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 return NULL;
11860
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011861 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011862 PyErr_SetString(PyExc_OverflowError,
11863 "repeated string is too long");
11864 return NULL;
11865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011867
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011868 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869 if (!u)
11870 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011871 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 if (PyUnicode_GET_LENGTH(str) == 1) {
11874 const int kind = PyUnicode_KIND(str);
11875 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011876 if (kind == PyUnicode_1BYTE_KIND) {
11877 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011878 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011879 }
11880 else if (kind == PyUnicode_2BYTE_KIND) {
11881 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011882 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011883 ucs2[n] = fill_char;
11884 } else {
11885 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11886 assert(kind == PyUnicode_4BYTE_KIND);
11887 for (n = 0; n < len; ++n)
11888 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 }
11891 else {
11892 /* number of characters copied this far */
11893 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011894 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 char *to = (char *) PyUnicode_DATA(u);
11896 Py_MEMCPY(to, PyUnicode_DATA(str),
11897 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011898 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 n = (done <= nchars-done) ? done : nchars-done;
11900 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011901 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903 }
11904
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011905 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011906 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907}
11908
Alexander Belopolsky40018472011-02-26 01:02:56 +000011909PyObject *
11910PyUnicode_Replace(PyObject *obj,
11911 PyObject *subobj,
11912 PyObject *replobj,
11913 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914{
11915 PyObject *self;
11916 PyObject *str1;
11917 PyObject *str2;
11918 PyObject *result;
11919
11920 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011921 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011924 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011925 Py_DECREF(self);
11926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927 }
11928 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011929 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 Py_DECREF(self);
11931 Py_DECREF(str1);
11932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011934 if (PyUnicode_READY(self) == -1 ||
11935 PyUnicode_READY(str1) == -1 ||
11936 PyUnicode_READY(str2) == -1)
11937 result = NULL;
11938 else
11939 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940 Py_DECREF(self);
11941 Py_DECREF(str1);
11942 Py_DECREF(str2);
11943 return result;
11944}
11945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011946PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011947 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948\n\
11949Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011950old replaced by new. If the optional argument count is\n\
11951given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952
11953static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 PyObject *str1;
11957 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011958 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959 PyObject *result;
11960
Martin v. Löwis18e16552006-02-15 17:27:45 +000011961 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011963 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011966 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 return NULL;
11968 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011969 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 Py_DECREF(str1);
11971 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011972 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011973 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11974 result = NULL;
11975 else
11976 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977
11978 Py_DECREF(str1);
11979 Py_DECREF(str2);
11980 return result;
11981}
11982
Alexander Belopolsky40018472011-02-26 01:02:56 +000011983static PyObject *
11984unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011986 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 Py_ssize_t isize;
11988 Py_ssize_t osize, squote, dquote, i, o;
11989 Py_UCS4 max, quote;
11990 int ikind, okind;
11991 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011994 return NULL;
11995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 isize = PyUnicode_GET_LENGTH(unicode);
11997 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 /* Compute length of output, quote characters, and
12000 maximum character */
12001 osize = 2; /* quotes */
12002 max = 127;
12003 squote = dquote = 0;
12004 ikind = PyUnicode_KIND(unicode);
12005 for (i = 0; i < isize; i++) {
12006 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012007 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012009 case '\'': squote++; break;
12010 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012012 incr = 2;
12013 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 default:
12015 /* Fast-path ASCII */
12016 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012017 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012019 ;
12020 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012023 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012025 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012027 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012029 if (osize > PY_SSIZE_T_MAX - incr) {
12030 PyErr_SetString(PyExc_OverflowError,
12031 "string is too long to generate repr");
12032 return NULL;
12033 }
12034 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 }
12036
12037 quote = '\'';
12038 if (squote) {
12039 if (dquote)
12040 /* Both squote and dquote present. Use squote,
12041 and escape them */
12042 osize += squote;
12043 else
12044 quote = '"';
12045 }
12046
12047 repr = PyUnicode_New(osize, max);
12048 if (repr == NULL)
12049 return NULL;
12050 okind = PyUnicode_KIND(repr);
12051 odata = PyUnicode_DATA(repr);
12052
12053 PyUnicode_WRITE(okind, odata, 0, quote);
12054 PyUnicode_WRITE(okind, odata, osize-1, quote);
12055
12056 for (i = 0, o = 1; i < isize; i++) {
12057 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012058
12059 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 if ((ch == quote) || (ch == '\\')) {
12061 PyUnicode_WRITE(okind, odata, o++, '\\');
12062 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012063 continue;
12064 }
12065
Benjamin Peterson29060642009-01-31 22:14:21 +000012066 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012067 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 PyUnicode_WRITE(okind, odata, o++, '\\');
12069 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012070 }
12071 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 PyUnicode_WRITE(okind, odata, o++, '\\');
12073 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012074 }
12075 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 PyUnicode_WRITE(okind, odata, o++, '\\');
12077 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012078 }
12079
12080 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012081 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 PyUnicode_WRITE(okind, odata, o++, '\\');
12083 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012084 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12085 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012086 }
12087
Georg Brandl559e5d72008-06-11 18:37:52 +000012088 /* Copy ASCII characters as-is */
12089 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012091 }
12092
Benjamin Peterson29060642009-01-31 22:14:21 +000012093 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012094 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012095 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012096 (categories Z* and C* except ASCII space)
12097 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012099 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012100 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012103 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12104 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012105 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012106 /* Map 16-bit characters to '\uxxxx' */
12107 else if (ch <= 0xffff) {
12108 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012109 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12110 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12111 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12112 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012113 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012114 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012115 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012116 PyUnicode_WRITE(okind, odata, o++, 'U');
12117 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12118 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12119 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12120 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012121 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12123 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12124 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012125 }
12126 }
12127 /* Copy characters as-is */
12128 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012130 }
12131 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012132 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012134 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012135 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136}
12137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012138PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012139 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140\n\
12141Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012142such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143arguments start and end are interpreted as in slice notation.\n\
12144\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012145Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146
12147static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012150 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012151 Py_ssize_t start;
12152 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012153 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154
Jesus Ceaac451502011-04-20 17:09:23 +020012155 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12156 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 if (PyUnicode_READY(self) == -1)
12160 return NULL;
12161 if (PyUnicode_READY(substring) == -1)
12162 return NULL;
12163
Victor Stinner7931d9a2011-11-04 00:22:48 +010012164 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165
12166 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168 if (result == -2)
12169 return NULL;
12170
Christian Heimes217cfd12007-12-02 14:31:20 +000012171 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172}
12173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012174PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012175 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012177Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
12179static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012182 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012183 Py_ssize_t start;
12184 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012185 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186
Jesus Ceaac451502011-04-20 17:09:23 +020012187 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12188 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012189 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 if (PyUnicode_READY(self) == -1)
12192 return NULL;
12193 if (PyUnicode_READY(substring) == -1)
12194 return NULL;
12195
Victor Stinner7931d9a2011-11-04 00:22:48 +010012196 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197
12198 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 if (result == -2)
12201 return NULL;
12202
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203 if (result < 0) {
12204 PyErr_SetString(PyExc_ValueError, "substring not found");
12205 return NULL;
12206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207
Christian Heimes217cfd12007-12-02 14:31:20 +000012208 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209}
12210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012211PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012212 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012214Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012215done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216
12217static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012218unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012220 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 Py_UCS4 fillchar = ' ';
12222
Victor Stinnere9a29352011-10-01 02:14:59 +020012223 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012225
Benjamin Petersonbac79492012-01-14 13:34:47 -050012226 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227 return NULL;
12228
Victor Stinnerc4b49542011-12-11 22:44:26 +010012229 if (PyUnicode_GET_LENGTH(self) >= width)
12230 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231
Victor Stinnerc4b49542011-12-11 22:44:26 +010012232 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233}
12234
Alexander Belopolsky40018472011-02-26 01:02:56 +000012235PyObject *
12236PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237{
12238 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012239
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240 s = PyUnicode_FromObject(s);
12241 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012242 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012243 if (sep != NULL) {
12244 sep = PyUnicode_FromObject(sep);
12245 if (sep == NULL) {
12246 Py_DECREF(s);
12247 return NULL;
12248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249 }
12250
Victor Stinner9310abb2011-10-05 00:59:23 +020012251 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252
12253 Py_DECREF(s);
12254 Py_XDECREF(sep);
12255 return result;
12256}
12257
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012258PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012259 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260\n\
12261Return a list of the words in S, using sep as the\n\
12262delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012263splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012264whitespace string is a separator and empty strings are\n\
12265removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266
12267static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012268unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012270 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012272 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012274 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12275 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276 return NULL;
12277
12278 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012281 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012283 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284}
12285
Thomas Wouters477c8d52006-05-27 19:21:47 +000012286PyObject *
12287PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12288{
12289 PyObject* str_obj;
12290 PyObject* sep_obj;
12291 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 int kind1, kind2, kind;
12293 void *buf1 = NULL, *buf2 = NULL;
12294 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012295
12296 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012297 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012298 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012299 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012300 if (!sep_obj) {
12301 Py_DECREF(str_obj);
12302 return NULL;
12303 }
12304 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12305 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012306 Py_DECREF(str_obj);
12307 return NULL;
12308 }
12309
Victor Stinner14f8f022011-10-05 20:58:25 +020012310 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012312 kind = Py_MAX(kind1, kind2);
12313 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012315 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 if (!buf1)
12317 goto onError;
12318 buf2 = PyUnicode_DATA(sep_obj);
12319 if (kind2 != kind)
12320 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12321 if (!buf2)
12322 goto onError;
12323 len1 = PyUnicode_GET_LENGTH(str_obj);
12324 len2 = PyUnicode_GET_LENGTH(sep_obj);
12325
Benjamin Petersonead6b532011-12-20 17:23:42 -060012326 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012328 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12329 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12330 else
12331 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 break;
12333 case PyUnicode_2BYTE_KIND:
12334 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12335 break;
12336 case PyUnicode_4BYTE_KIND:
12337 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12338 break;
12339 default:
12340 assert(0);
12341 out = 0;
12342 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012343
12344 Py_DECREF(sep_obj);
12345 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 if (kind1 != kind)
12347 PyMem_Free(buf1);
12348 if (kind2 != kind)
12349 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012350
12351 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 onError:
12353 Py_DECREF(sep_obj);
12354 Py_DECREF(str_obj);
12355 if (kind1 != kind && buf1)
12356 PyMem_Free(buf1);
12357 if (kind2 != kind && buf2)
12358 PyMem_Free(buf2);
12359 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012360}
12361
12362
12363PyObject *
12364PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12365{
12366 PyObject* str_obj;
12367 PyObject* sep_obj;
12368 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 int kind1, kind2, kind;
12370 void *buf1 = NULL, *buf2 = NULL;
12371 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012372
12373 str_obj = PyUnicode_FromObject(str_in);
12374 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012376 sep_obj = PyUnicode_FromObject(sep_in);
12377 if (!sep_obj) {
12378 Py_DECREF(str_obj);
12379 return NULL;
12380 }
12381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 kind1 = PyUnicode_KIND(str_in);
12383 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012384 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 buf1 = PyUnicode_DATA(str_in);
12386 if (kind1 != kind)
12387 buf1 = _PyUnicode_AsKind(str_in, kind);
12388 if (!buf1)
12389 goto onError;
12390 buf2 = PyUnicode_DATA(sep_obj);
12391 if (kind2 != kind)
12392 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12393 if (!buf2)
12394 goto onError;
12395 len1 = PyUnicode_GET_LENGTH(str_obj);
12396 len2 = PyUnicode_GET_LENGTH(sep_obj);
12397
Benjamin Petersonead6b532011-12-20 17:23:42 -060012398 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012400 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12401 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12402 else
12403 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 break;
12405 case PyUnicode_2BYTE_KIND:
12406 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12407 break;
12408 case PyUnicode_4BYTE_KIND:
12409 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12410 break;
12411 default:
12412 assert(0);
12413 out = 0;
12414 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012415
12416 Py_DECREF(sep_obj);
12417 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 if (kind1 != kind)
12419 PyMem_Free(buf1);
12420 if (kind2 != kind)
12421 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012422
12423 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 onError:
12425 Py_DECREF(sep_obj);
12426 Py_DECREF(str_obj);
12427 if (kind1 != kind && buf1)
12428 PyMem_Free(buf1);
12429 if (kind2 != kind && buf2)
12430 PyMem_Free(buf2);
12431 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012432}
12433
12434PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012435 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012436\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012437Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012438the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012439found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012440
12441static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012442unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012443{
Victor Stinner9310abb2011-10-05 00:59:23 +020012444 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012445}
12446
12447PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012448 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012449\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012450Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012451the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012452separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012453
12454static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012455unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012456{
Victor Stinner9310abb2011-10-05 00:59:23 +020012457 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012458}
12459
Alexander Belopolsky40018472011-02-26 01:02:56 +000012460PyObject *
12461PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012462{
12463 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012465 s = PyUnicode_FromObject(s);
12466 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012467 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012468 if (sep != NULL) {
12469 sep = PyUnicode_FromObject(sep);
12470 if (sep == NULL) {
12471 Py_DECREF(s);
12472 return NULL;
12473 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012474 }
12475
Victor Stinner9310abb2011-10-05 00:59:23 +020012476 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012477
12478 Py_DECREF(s);
12479 Py_XDECREF(sep);
12480 return result;
12481}
12482
12483PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012484 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012485\n\
12486Return a list of the words in S, using sep as the\n\
12487delimiter string, starting at the end of the string and\n\
12488working to the front. If maxsplit is given, at most maxsplit\n\
12489splits are done. If sep is not specified, any whitespace string\n\
12490is a separator.");
12491
12492static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012493unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012494{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012495 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012496 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012497 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012498
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012499 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12500 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012501 return NULL;
12502
12503 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012504 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012505 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012506 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012507 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012508 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012509}
12510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012511PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513\n\
12514Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012515Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012516is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517
12518static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012519unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012521 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012522 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012524 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12525 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526 return NULL;
12527
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012528 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529}
12530
12531static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012532PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012534 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535}
12536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012537PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012538 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539\n\
12540Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012541and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542
12543static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012544unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012546 if (PyUnicode_READY(self) == -1)
12547 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012548 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549}
12550
Georg Brandlceee0772007-11-27 23:48:05 +000012551PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012552 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012553\n\
12554Return a translation table usable for str.translate().\n\
12555If there is only one argument, it must be a dictionary mapping Unicode\n\
12556ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012557Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012558If there are two arguments, they must be strings of equal length, and\n\
12559in the resulting dictionary, each character in x will be mapped to the\n\
12560character at the same position in y. If there is a third argument, it\n\
12561must be a string, whose characters will be mapped to None in the result.");
12562
12563static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012564unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012565{
12566 PyObject *x, *y = NULL, *z = NULL;
12567 PyObject *new = NULL, *key, *value;
12568 Py_ssize_t i = 0;
12569 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012570
Georg Brandlceee0772007-11-27 23:48:05 +000012571 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12572 return NULL;
12573 new = PyDict_New();
12574 if (!new)
12575 return NULL;
12576 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 int x_kind, y_kind, z_kind;
12578 void *x_data, *y_data, *z_data;
12579
Georg Brandlceee0772007-11-27 23:48:05 +000012580 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012581 if (!PyUnicode_Check(x)) {
12582 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12583 "be a string if there is a second argument");
12584 goto err;
12585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012587 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12588 "arguments must have equal length");
12589 goto err;
12590 }
12591 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012592 x_kind = PyUnicode_KIND(x);
12593 y_kind = PyUnicode_KIND(y);
12594 x_data = PyUnicode_DATA(x);
12595 y_data = PyUnicode_DATA(y);
12596 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12597 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012598 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012599 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012600 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012601 if (!value) {
12602 Py_DECREF(key);
12603 goto err;
12604 }
Georg Brandlceee0772007-11-27 23:48:05 +000012605 res = PyDict_SetItem(new, key, value);
12606 Py_DECREF(key);
12607 Py_DECREF(value);
12608 if (res < 0)
12609 goto err;
12610 }
12611 /* create entries for deleting chars in z */
12612 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 z_kind = PyUnicode_KIND(z);
12614 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012615 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012617 if (!key)
12618 goto err;
12619 res = PyDict_SetItem(new, key, Py_None);
12620 Py_DECREF(key);
12621 if (res < 0)
12622 goto err;
12623 }
12624 }
12625 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 int kind;
12627 void *data;
12628
Georg Brandlceee0772007-11-27 23:48:05 +000012629 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012630 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012631 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12632 "to maketrans it must be a dict");
12633 goto err;
12634 }
12635 /* copy entries into the new dict, converting string keys to int keys */
12636 while (PyDict_Next(x, &i, &key, &value)) {
12637 if (PyUnicode_Check(key)) {
12638 /* convert string keys to integer keys */
12639 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012640 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012641 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12642 "table must be of length 1");
12643 goto err;
12644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 kind = PyUnicode_KIND(key);
12646 data = PyUnicode_DATA(key);
12647 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012648 if (!newkey)
12649 goto err;
12650 res = PyDict_SetItem(new, newkey, value);
12651 Py_DECREF(newkey);
12652 if (res < 0)
12653 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012654 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012655 /* just keep integer keys */
12656 if (PyDict_SetItem(new, key, value) < 0)
12657 goto err;
12658 } else {
12659 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12660 "be strings or integers");
12661 goto err;
12662 }
12663 }
12664 }
12665 return new;
12666 err:
12667 Py_DECREF(new);
12668 return NULL;
12669}
12670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012671PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012672 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673\n\
12674Return a copy of the string S, where all characters have been mapped\n\
12675through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012676Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012677Unmapped characters are left untouched. Characters mapped to None\n\
12678are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679
12680static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684}
12685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012686PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012687 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012689Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690
12691static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012692unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012694 if (PyUnicode_READY(self) == -1)
12695 return NULL;
12696 if (PyUnicode_IS_ASCII(self))
12697 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012698 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699}
12700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012701PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012702 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012704Pad a numeric string S with zeros on the left, to fill a field\n\
12705of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706
12707static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012708unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012710 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012711 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012712 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 int kind;
12714 void *data;
12715 Py_UCS4 chr;
12716
Martin v. Löwis18e16552006-02-15 17:27:45 +000012717 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718 return NULL;
12719
Benjamin Petersonbac79492012-01-14 13:34:47 -050012720 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012721 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722
Victor Stinnerc4b49542011-12-11 22:44:26 +010012723 if (PyUnicode_GET_LENGTH(self) >= width)
12724 return unicode_result_unchanged(self);
12725
12726 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727
12728 u = pad(self, fill, 0, '0');
12729
Walter Dörwald068325e2002-04-15 13:36:47 +000012730 if (u == NULL)
12731 return NULL;
12732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 kind = PyUnicode_KIND(u);
12734 data = PyUnicode_DATA(u);
12735 chr = PyUnicode_READ(kind, data, fill);
12736
12737 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012738 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 PyUnicode_WRITE(kind, data, 0, chr);
12740 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741 }
12742
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012743 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012744 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746
#if 0
/* Compiled out.  Wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii;

    ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii;
}
#endif
12754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012755PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012756 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012757\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012758Return True if S starts with the specified prefix, False otherwise.\n\
12759With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012760With optional end, stop comparing S at that position.\n\
12761prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762
12763static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012764unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012765 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012767 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012768 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012769 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012770 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012771 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772
Jesus Ceaac451502011-04-20 17:09:23 +020012773 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012774 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012775 if (PyTuple_Check(subobj)) {
12776 Py_ssize_t i;
12777 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012778 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012779 if (substring == NULL)
12780 return NULL;
12781 result = tailmatch(self, substring, start, end, -1);
12782 Py_DECREF(substring);
12783 if (result) {
12784 Py_RETURN_TRUE;
12785 }
12786 }
12787 /* nothing matched */
12788 Py_RETURN_FALSE;
12789 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012790 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012791 if (substring == NULL) {
12792 if (PyErr_ExceptionMatches(PyExc_TypeError))
12793 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12794 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012795 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012796 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012797 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012799 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800}
12801
12802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012803PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012804 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012806Return True if S ends with the specified suffix, False otherwise.\n\
12807With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012808With optional end, stop comparing S at that position.\n\
12809suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810
12811static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012812unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012813 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012815 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012816 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012817 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012818 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012819 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820
Jesus Ceaac451502011-04-20 17:09:23 +020012821 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012822 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012823 if (PyTuple_Check(subobj)) {
12824 Py_ssize_t i;
12825 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012826 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012827 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012828 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012830 result = tailmatch(self, substring, start, end, +1);
12831 Py_DECREF(substring);
12832 if (result) {
12833 Py_RETURN_TRUE;
12834 }
12835 }
12836 Py_RETURN_FALSE;
12837 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012838 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012839 if (substring == NULL) {
12840 if (PyErr_ExceptionMatches(PyExc_TypeError))
12841 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12842 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012843 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012844 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012845 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012847 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848}
12849
Victor Stinner202fdca2012-05-07 12:47:02 +020012850Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012851_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012852{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012853 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012854 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12855 writer->data = PyUnicode_DATA(writer->buffer);
12856 writer->kind = PyUnicode_KIND(writer->buffer);
12857}
12858
Victor Stinnerd3f08822012-05-29 12:57:52 +020012859void
12860_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012861{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012862 memset(writer, 0, sizeof(*writer));
12863#ifdef Py_DEBUG
12864 writer->kind = 5; /* invalid kind */
12865#endif
12866 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012867 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012868}
12869
Victor Stinnerd3f08822012-05-29 12:57:52 +020012870int
12871_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12872 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012873{
12874 Py_ssize_t newlen;
12875 PyObject *newbuffer;
12876
Victor Stinnerd3f08822012-05-29 12:57:52 +020012877 assert(length > 0);
12878
Victor Stinner202fdca2012-05-07 12:47:02 +020012879 if (length > PY_SSIZE_T_MAX - writer->pos) {
12880 PyErr_NoMemory();
12881 return -1;
12882 }
12883 newlen = writer->pos + length;
12884
Victor Stinnerd3f08822012-05-29 12:57:52 +020012885 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012886 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012887 /* overallocate 25% to limit the number of resize */
12888 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12889 newlen += newlen / 4;
12890 if (newlen < writer->min_length)
12891 newlen = writer->min_length;
12892 }
12893 writer->buffer = PyUnicode_New(newlen, maxchar);
12894 if (writer->buffer == NULL)
12895 return -1;
12896 _PyUnicodeWriter_Update(writer);
12897 return 0;
12898 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012899
Victor Stinnerd3f08822012-05-29 12:57:52 +020012900 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012901 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012902 /* overallocate 25% to limit the number of resize */
12903 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12904 newlen += newlen / 4;
12905 if (newlen < writer->min_length)
12906 newlen = writer->min_length;
12907 }
12908
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012909 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012910 /* resize + widen */
12911 newbuffer = PyUnicode_New(newlen, maxchar);
12912 if (newbuffer == NULL)
12913 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012914 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12915 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012916 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012917 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012918 }
12919 else {
12920 newbuffer = resize_compact(writer->buffer, newlen);
12921 if (newbuffer == NULL)
12922 return -1;
12923 }
12924 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012925 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012926 }
12927 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012928 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012929 newbuffer = PyUnicode_New(writer->size, maxchar);
12930 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012931 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012932 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12933 writer->buffer, 0, writer->pos);
12934 Py_DECREF(writer->buffer);
12935 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012936 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012937 }
12938 return 0;
12939}
12940
Victor Stinnerd3f08822012-05-29 12:57:52 +020012941int
12942_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12943{
12944 Py_UCS4 maxchar;
12945 Py_ssize_t len;
12946
12947 if (PyUnicode_READY(str) == -1)
12948 return -1;
12949 len = PyUnicode_GET_LENGTH(str);
12950 if (len == 0)
12951 return 0;
12952 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12953 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012954 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012955 Py_INCREF(str);
12956 writer->buffer = str;
12957 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012958 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012959 writer->size = 0;
12960 writer->pos += len;
12961 return 0;
12962 }
12963 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12964 return -1;
12965 }
12966 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12967 str, 0, len);
12968 writer->pos += len;
12969 return 0;
12970}
12971
12972PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012973_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012974{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012975 if (writer->pos == 0) {
12976 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020012977 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020012978 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012979 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012980 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12981 return writer->buffer;
12982 }
12983 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12984 PyObject *newbuffer;
12985 newbuffer = resize_compact(writer->buffer, writer->pos);
12986 if (newbuffer == NULL) {
12987 Py_DECREF(writer->buffer);
12988 return NULL;
12989 }
12990 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012991 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012992 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner2cb16aa2013-03-06 19:28:37 +010012993 return unicode_result_ready(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012994}
12995
Victor Stinnerd3f08822012-05-29 12:57:52 +020012996void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012997_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012998{
12999 Py_CLEAR(writer->buffer);
13000}
13001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013002#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013003
13004PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013005 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013006\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013007Return a formatted version of S, using substitutions from args and kwargs.\n\
13008The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013009
Eric Smith27bbca62010-11-04 17:06:58 +000013010PyDoc_STRVAR(format_map__doc__,
13011 "S.format_map(mapping) -> str\n\
13012\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013013Return a formatted version of S, using substitutions from mapping.\n\
13014The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013015
Eric Smith4a7d76d2008-05-30 18:10:19 +000013016static PyObject *
13017unicode__format__(PyObject* self, PyObject* args)
13018{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013019 PyObject *format_spec;
13020 _PyUnicodeWriter writer;
13021 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013022
13023 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13024 return NULL;
13025
Victor Stinnerd3f08822012-05-29 12:57:52 +020013026 if (PyUnicode_READY(self) == -1)
13027 return NULL;
13028 _PyUnicodeWriter_Init(&writer, 0);
13029 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13030 self, format_spec, 0,
13031 PyUnicode_GET_LENGTH(format_spec));
13032 if (ret == -1) {
13033 _PyUnicodeWriter_Dealloc(&writer);
13034 return NULL;
13035 }
13036 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013037}
13038
Eric Smith8c663262007-08-25 02:26:07 +000013039PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013040 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013041\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013042Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013043
13044static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013045unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013046{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013047 Py_ssize_t size;
13048
13049 /* If it's a compact object, account for base structure +
13050 character data. */
13051 if (PyUnicode_IS_COMPACT_ASCII(v))
13052 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13053 else if (PyUnicode_IS_COMPACT(v))
13054 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013055 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013056 else {
13057 /* If it is a two-block object, account for base object, and
13058 for character block if present. */
13059 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013060 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013061 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013062 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 }
13064 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013065 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013066 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013067 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013068 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013069 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013070
13071 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013072}
13073
13074PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013075 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013076
13077static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013078unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013079{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013080 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 if (!copy)
13082 return NULL;
13083 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013084}
13085
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013087 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013088 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013089 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13090 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013091 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13092 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013093 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013094 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13095 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13096 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13097 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13098 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013099 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013100 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13101 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13102 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013103 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013104 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13105 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13106 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013107 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013108 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013109 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013110 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013111 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13112 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13113 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13114 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13115 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13116 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13117 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13118 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13119 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13120 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13121 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13122 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13123 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13124 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013125 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013126 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013127 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013128 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013129 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013130 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013131 {"maketrans", (PyCFunction) unicode_maketrans,
13132 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013133 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013134#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013135 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013136 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137#endif
13138
Benjamin Peterson14339b62009-01-31 16:36:08 +000013139 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140 {NULL, NULL}
13141};
13142
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013143static PyObject *
13144unicode_mod(PyObject *v, PyObject *w)
13145{
Brian Curtindfc80e32011-08-10 20:28:54 -050013146 if (!PyUnicode_Check(v))
13147 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013149}
13150
13151static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013152 0, /*nb_add*/
13153 0, /*nb_subtract*/
13154 0, /*nb_multiply*/
13155 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013156};
13157
Guido van Rossumd57fd912000-03-10 22:53:23 +000013158static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013159 (lenfunc) unicode_length, /* sq_length */
13160 PyUnicode_Concat, /* sq_concat */
13161 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13162 (ssizeargfunc) unicode_getitem, /* sq_item */
13163 0, /* sq_slice */
13164 0, /* sq_ass_item */
13165 0, /* sq_ass_slice */
13166 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167};
13168
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013169static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013170unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013171{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013172 if (PyUnicode_READY(self) == -1)
13173 return NULL;
13174
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013175 if (PyIndex_Check(item)) {
13176 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013177 if (i == -1 && PyErr_Occurred())
13178 return NULL;
13179 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013180 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013181 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013182 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013183 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013184 PyObject *result;
13185 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013186 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013187 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013189 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013190 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013191 return NULL;
13192 }
13193
13194 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013195 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013197 slicelength == PyUnicode_GET_LENGTH(self)) {
13198 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013199 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013200 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013201 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013202 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013203 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013204 src_kind = PyUnicode_KIND(self);
13205 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013206 if (!PyUnicode_IS_ASCII(self)) {
13207 kind_limit = kind_maxchar_limit(src_kind);
13208 max_char = 0;
13209 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13210 ch = PyUnicode_READ(src_kind, src_data, cur);
13211 if (ch > max_char) {
13212 max_char = ch;
13213 if (max_char >= kind_limit)
13214 break;
13215 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013216 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013217 }
Victor Stinner55c99112011-10-13 01:17:06 +020013218 else
13219 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013220 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013221 if (result == NULL)
13222 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013223 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013224 dest_data = PyUnicode_DATA(result);
13225
13226 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013227 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13228 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013229 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013230 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013231 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013232 } else {
13233 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13234 return NULL;
13235 }
13236}
13237
13238static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013239 (lenfunc)unicode_length, /* mp_length */
13240 (binaryfunc)unicode_subscript, /* mp_subscript */
13241 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013242};
13243
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245/* Helpers for PyUnicode_Format() */
13246
13247static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013248getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013250 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013252 (*p_argidx)++;
13253 if (arglen < 0)
13254 return args;
13255 else
13256 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257 }
13258 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013259 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260 return NULL;
13261}
13262
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013263/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013264
Victor Stinnerd3f08822012-05-29 12:57:52 +020013265static int
13266formatfloat(PyObject *v, int flags, int prec, int type,
13267 PyObject **p_output, _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013268{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013269 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013271 Py_ssize_t len;
Tim Petersced69f82003-09-16 20:30:58 +000013272
Guido van Rossumd57fd912000-03-10 22:53:23 +000013273 x = PyFloat_AsDouble(v);
13274 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013275 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013276
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013278 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013279
Eric Smith0923d1d2009-04-16 20:16:10 +000013280 p = PyOS_double_to_string(x, type, prec,
13281 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013282 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013283 return -1;
13284 len = strlen(p);
13285 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013286 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13287 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013288 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013289 }
Victor Stinner184252a2012-06-16 02:57:41 +020013290 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013291 writer->pos += len;
13292 }
13293 else
13294 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013295 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013296 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297}
13298
Victor Stinnerd0880d52012-04-27 23:40:13 +020013299/* formatlong() emulates the format codes d, u, o, x and X, and
13300 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13301 * Python's regular ints.
13302 * Return value: a new PyUnicodeObject*, or NULL if error.
13303 * The output string is of the form
13304 * "-"? ("0x" | "0X")? digit+
13305 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13306 * set in flags. The case of hex digits will be correct,
13307 * There will be at least prec digits, zero-filled on the left if
13308 * necessary to get that many.
13309 * val object to be converted
13310 * flags bitmask of format flags; only F_ALT is looked at
13311 * prec minimum number of digits; 0-fill on left if needed
13312 * type a character in [duoxX]; u acts the same as d
13313 *
13314 * CAUTION: o, x and X conversions on regular ints can never
13315 * produce a '-' sign, but can for Python's unbounded ints.
13316 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013317static PyObject*
13318formatlong(PyObject *val, int flags, int prec, int type)
13319{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013320 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013321 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013322 Py_ssize_t i;
13323 int sign; /* 1 if '-', else 0 */
13324 int len; /* number of characters */
13325 Py_ssize_t llen;
13326 int numdigits; /* len == numnondigits + numdigits */
13327 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013328
Victor Stinnerd0880d52012-04-27 23:40:13 +020013329 /* Avoid exceeding SSIZE_T_MAX */
13330 if (prec > INT_MAX-3) {
13331 PyErr_SetString(PyExc_OverflowError,
13332 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013333 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013334 }
13335
13336 assert(PyLong_Check(val));
13337
13338 switch (type) {
13339 case 'd':
13340 case 'u':
13341 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013342 if (PyBool_Check(val))
13343 result = PyNumber_ToBase(val, 10);
13344 else
13345 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013346 break;
13347 case 'o':
13348 numnondigits = 2;
13349 result = PyNumber_ToBase(val, 8);
13350 break;
13351 case 'x':
13352 case 'X':
13353 numnondigits = 2;
13354 result = PyNumber_ToBase(val, 16);
13355 break;
13356 default:
13357 assert(!"'type' not in [duoxX]");
13358 }
13359 if (!result)
13360 return NULL;
13361
13362 assert(unicode_modifiable(result));
13363 assert(PyUnicode_IS_READY(result));
13364 assert(PyUnicode_IS_ASCII(result));
13365
13366 /* To modify the string in-place, there can only be one reference. */
13367 if (Py_REFCNT(result) != 1) {
13368 PyErr_BadInternalCall();
13369 return NULL;
13370 }
13371 buf = PyUnicode_DATA(result);
13372 llen = PyUnicode_GET_LENGTH(result);
13373 if (llen > INT_MAX) {
13374 PyErr_SetString(PyExc_ValueError,
13375 "string too large in _PyBytes_FormatLong");
13376 return NULL;
13377 }
13378 len = (int)llen;
13379 sign = buf[0] == '-';
13380 numnondigits += sign;
13381 numdigits = len - numnondigits;
13382 assert(numdigits > 0);
13383
13384 /* Get rid of base marker unless F_ALT */
13385 if (((flags & F_ALT) == 0 &&
13386 (type == 'o' || type == 'x' || type == 'X'))) {
13387 assert(buf[sign] == '0');
13388 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13389 buf[sign+1] == 'o');
13390 numnondigits -= 2;
13391 buf += 2;
13392 len -= 2;
13393 if (sign)
13394 buf[0] = '-';
13395 assert(len == numnondigits + numdigits);
13396 assert(numdigits > 0);
13397 }
13398
13399 /* Fill with leading zeroes to meet minimum width. */
13400 if (prec > numdigits) {
13401 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13402 numnondigits + prec);
13403 char *b1;
13404 if (!r1) {
13405 Py_DECREF(result);
13406 return NULL;
13407 }
13408 b1 = PyBytes_AS_STRING(r1);
13409 for (i = 0; i < numnondigits; ++i)
13410 *b1++ = *buf++;
13411 for (i = 0; i < prec - numdigits; i++)
13412 *b1++ = '0';
13413 for (i = 0; i < numdigits; i++)
13414 *b1++ = *buf++;
13415 *b1 = '\0';
13416 Py_DECREF(result);
13417 result = r1;
13418 buf = PyBytes_AS_STRING(result);
13419 len = numnondigits + prec;
13420 }
13421
13422 /* Fix up case for hex conversions. */
13423 if (type == 'X') {
13424 /* Need to convert all lower case letters to upper case.
13425 and need to convert 0x to 0X (and -0x to -0X). */
13426 for (i = 0; i < len; i++)
13427 if (buf[i] >= 'a' && buf[i] <= 'x')
13428 buf[i] -= 'a'-'A';
13429 }
13430 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13431 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013432 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013433 Py_DECREF(result);
13434 result = unicode;
13435 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013436 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013437}
13438
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013439static Py_UCS4
13440formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013441{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013442 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013443 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013444 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013445 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013446 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 goto onError;
13448 }
13449 else {
13450 /* Integer input truncated to a character */
13451 long x;
13452 x = PyLong_AsLong(v);
13453 if (x == -1 && PyErr_Occurred())
13454 goto onError;
13455
Victor Stinner8faf8212011-12-08 22:14:11 +010013456 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013457 PyErr_SetString(PyExc_OverflowError,
13458 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013459 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013460 }
13461
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013462 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013463 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013464
Benjamin Peterson29060642009-01-31 22:14:21 +000013465 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013466 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013467 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013468 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013469}
13470
Alexander Belopolsky40018472011-02-26 01:02:56 +000013471PyObject *
13472PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013473{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013474 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013475 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013476 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013477 PyObject *temp = NULL;
13478 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013479 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013480 void *fmt;
13481 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013482 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013483 Py_ssize_t sublen;
13484 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013485
Guido van Rossumd57fd912000-03-10 22:53:23 +000013486 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013487 PyErr_BadInternalCall();
13488 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013489 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013490 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013491 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013492 return NULL;
Victor Stinner19294072012-10-05 00:09:33 +020013493 if (PyUnicode_READY(uformat) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013494 Py_DECREF(uformat);
Victor Stinner19294072012-10-05 00:09:33 +020013495 return NULL;
13496 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013498 fmt = PyUnicode_DATA(uformat);
13499 fmtkind = PyUnicode_KIND(uformat);
13500 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13501 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013502
Victor Stinnerd3f08822012-05-29 12:57:52 +020013503 _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013504
Guido van Rossumd57fd912000-03-10 22:53:23 +000013505 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 arglen = PyTuple_Size(args);
13507 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013508 }
13509 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 arglen = -1;
13511 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013512 }
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013513 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013514 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013515
13516 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013517 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013518 Py_ssize_t nonfmtpos;
13519 nonfmtpos = fmtpos++;
13520 while (fmtcnt >= 0 &&
13521 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13522 fmtpos++;
13523 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013524 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013525 if (fmtcnt < 0)
13526 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013527 sublen = fmtpos - nonfmtpos;
13528 maxchar = _PyUnicode_FindMaxChar(uformat,
13529 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013530 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013531 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013532
Victor Stinnerd3f08822012-05-29 12:57:52 +020013533 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13534 uformat, nonfmtpos, sublen);
Victor Stinneree4544c2012-05-09 22:24:08 +020013535 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013536 }
13537 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 /* Got a format specifier */
13539 int flags = 0;
13540 Py_ssize_t width = -1;
13541 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013542 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013543 Py_UCS4 fill;
13544 int sign;
13545 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013546 int isnumok;
13547 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013548 void *pbuf = NULL;
13549 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013550 Py_UCS4 bufmaxchar;
13551 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013553 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013554 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13555 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013556 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013557 Py_ssize_t keylen;
13558 PyObject *key;
13559 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013560
Benjamin Peterson29060642009-01-31 22:14:21 +000013561 if (dict == NULL) {
13562 PyErr_SetString(PyExc_TypeError,
13563 "format requires a mapping");
13564 goto onError;
13565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013566 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013567 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013568 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 /* Skip over balanced parentheses */
13570 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013571 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13572 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013573 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013574 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013575 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013576 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013578 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013579 if (fmtcnt < 0 || pcount > 0) {
13580 PyErr_SetString(PyExc_ValueError,
13581 "incomplete format key");
13582 goto onError;
13583 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013584 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013585 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013586 if (key == NULL)
13587 goto onError;
13588 if (args_owned) {
13589 Py_DECREF(args);
13590 args_owned = 0;
13591 }
13592 args = PyObject_GetItem(dict, key);
13593 Py_DECREF(key);
13594 if (args == NULL) {
13595 goto onError;
13596 }
13597 args_owned = 1;
13598 arglen = -1;
13599 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013600 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013602 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13603 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013604 case '-': flags |= F_LJUST; continue;
13605 case '+': flags |= F_SIGN; continue;
13606 case ' ': flags |= F_BLANK; continue;
13607 case '#': flags |= F_ALT; continue;
13608 case '0': flags |= F_ZERO; continue;
13609 }
13610 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013611 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013612 if (c == '*') {
13613 v = getnextarg(args, arglen, &argidx);
13614 if (v == NULL)
13615 goto onError;
13616 if (!PyLong_Check(v)) {
13617 PyErr_SetString(PyExc_TypeError,
13618 "* wants int");
13619 goto onError;
13620 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013621 width = PyLong_AsSsize_t(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013622 if (width == -1 && PyErr_Occurred())
13623 goto onError;
13624 if (width < 0) {
13625 flags |= F_LJUST;
13626 width = -width;
13627 }
13628 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013629 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013630 }
13631 else if (c >= '0' && c <= '9') {
13632 width = c - '0';
13633 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013634 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013635 if (c < '0' || c > '9')
13636 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013637 /* Since c is unsigned, the RHS would end up as unsigned,
13638 mixing signed and unsigned comparison. Since c is between
13639 '0' and '9', casting to int is safe. */
13640 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013641 PyErr_SetString(PyExc_ValueError,
13642 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013643 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 }
13645 width = width*10 + (c - '0');
13646 }
13647 }
13648 if (c == '.') {
13649 prec = 0;
13650 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013651 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013652 if (c == '*') {
13653 v = getnextarg(args, arglen, &argidx);
13654 if (v == NULL)
13655 goto onError;
13656 if (!PyLong_Check(v)) {
13657 PyErr_SetString(PyExc_TypeError,
13658 "* wants int");
13659 goto onError;
13660 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013661 prec = _PyLong_AsInt(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013662 if (prec == -1 && PyErr_Occurred())
13663 goto onError;
13664 if (prec < 0)
13665 prec = 0;
13666 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013667 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013668 }
13669 else if (c >= '0' && c <= '9') {
13670 prec = c - '0';
13671 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013672 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013673 if (c < '0' || c > '9')
13674 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013675 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013676 PyErr_SetString(PyExc_ValueError,
13677 "prec too big");
13678 goto onError;
13679 }
13680 prec = prec*10 + (c - '0');
13681 }
13682 }
13683 } /* prec */
13684 if (fmtcnt >= 0) {
13685 if (c == 'h' || c == 'l' || c == 'L') {
13686 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013687 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013688 }
13689 }
13690 if (fmtcnt < 0) {
13691 PyErr_SetString(PyExc_ValueError,
13692 "incomplete format");
13693 goto onError;
13694 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013695 if (fmtcnt == 0)
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013696 writer.overallocate = 0;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013697
13698 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013699 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013700 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013701 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13702 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013703 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013704 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013705
Victor Stinneraff3cc62012-04-30 05:19:21 +020013706 v = getnextarg(args, arglen, &argidx);
13707 if (v == NULL)
13708 goto onError;
13709
Benjamin Peterson29060642009-01-31 22:14:21 +000013710 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013711 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013712 fill = ' ';
13713 switch (c) {
13714
Benjamin Peterson29060642009-01-31 22:14:21 +000013715 case 's':
13716 case 'r':
13717 case 'a':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013718 if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13719 /* Fast path */
13720 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13721 goto onError;
13722 goto nextarg;
13723 }
13724
Victor Stinner808fc0a2010-03-22 12:50:40 +000013725 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013726 temp = v;
13727 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013728 }
13729 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013730 if (c == 's')
13731 temp = PyObject_Str(v);
13732 else if (c == 'r')
13733 temp = PyObject_Repr(v);
13734 else
13735 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013736 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013737 break;
13738
13739 case 'i':
13740 case 'd':
13741 case 'u':
13742 case 'o':
13743 case 'x':
13744 case 'X':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013745 if (PyLong_CheckExact(v)
13746 && width == -1 && prec == -1
13747 && !(flags & (F_SIGN | F_BLANK)))
13748 {
13749 /* Fast path */
13750 switch(c)
13751 {
13752 case 'd':
13753 case 'i':
13754 case 'u':
13755 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13756 goto onError;
13757 goto nextarg;
13758 case 'x':
13759 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13760 goto onError;
13761 goto nextarg;
13762 case 'o':
13763 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13764 goto onError;
13765 goto nextarg;
13766 default:
13767 break;
13768 }
13769 }
13770
Benjamin Peterson29060642009-01-31 22:14:21 +000013771 isnumok = 0;
13772 if (PyNumber_Check(v)) {
13773 PyObject *iobj=NULL;
13774
13775 if (PyLong_Check(v)) {
13776 iobj = v;
13777 Py_INCREF(iobj);
13778 }
13779 else {
13780 iobj = PyNumber_Long(v);
13781 }
13782 if (iobj!=NULL) {
13783 if (PyLong_Check(iobj)) {
13784 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013785 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013786 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013787 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013788 }
13789 else {
13790 Py_DECREF(iobj);
13791 }
13792 }
13793 }
13794 if (!isnumok) {
13795 PyErr_Format(PyExc_TypeError,
13796 "%%%c format: a number is required, "
13797 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13798 goto onError;
13799 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013800 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013801 fill = '0';
13802 break;
13803
13804 case 'e':
13805 case 'E':
13806 case 'f':
13807 case 'F':
13808 case 'g':
13809 case 'G':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013810 if (width == -1 && prec == -1
13811 && !(flags & (F_SIGN | F_BLANK)))
13812 {
13813 /* Fast path */
13814 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13815 goto onError;
13816 goto nextarg;
13817 }
13818
Benjamin Peterson29060642009-01-31 22:14:21 +000013819 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013820 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013821 fill = '0';
Victor Stinnerd3f08822012-05-29 12:57:52 +020013822 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13823 temp = NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013824 break;
13825
13826 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013827 {
13828 Py_UCS4 ch = formatchar(v);
13829 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013830 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013831 if (width == -1 && prec == -1) {
13832 /* Fast path */
13833 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13834 goto onError;
13835 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13836 writer.pos += 1;
13837 goto nextarg;
13838 }
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013839 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013840 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013841 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013842
13843 default:
13844 PyErr_Format(PyExc_ValueError,
13845 "unsupported format character '%c' (0x%x) "
13846 "at index %zd",
13847 (31<=c && c<=126) ? (char)c : '?',
13848 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013849 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 goto onError;
13851 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013852 if (temp == NULL)
13853 goto onError;
13854 assert (PyUnicode_Check(temp));
Victor Stinnerd3f08822012-05-29 12:57:52 +020013855
13856 if (width == -1 && prec == -1
13857 && !(flags & (F_SIGN | F_BLANK)))
13858 {
13859 /* Fast path */
13860 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13861 goto onError;
13862 goto nextarg;
13863 }
13864
Victor Stinneraff3cc62012-04-30 05:19:21 +020013865 if (PyUnicode_READY(temp) == -1) {
13866 Py_CLEAR(temp);
13867 goto onError;
13868 }
13869 kind = PyUnicode_KIND(temp);
13870 pbuf = PyUnicode_DATA(temp);
13871 len = PyUnicode_GET_LENGTH(temp);
13872
13873 if (c == 's' || c == 'r' || c == 'a') {
13874 if (prec >= 0 && len > prec)
13875 len = prec;
13876 }
13877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013878 /* pbuf is initialized here. */
13879 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013880 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013881 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13882 if (ch == '-' || ch == '+') {
13883 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013884 len--;
13885 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013886 }
13887 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013888 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000013889 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013890 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000013891 else
13892 sign = 0;
13893 }
13894 if (width < len)
13895 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013896
13897 /* Compute the length and maximum character of the
13898 written characters */
13899 bufmaxchar = 127;
13900 if (!(flags & F_LJUST)) {
13901 if (sign) {
13902 if ((width-1) > len)
Benjamin Peterson7e303732013-06-10 09:19:46 -070013903 bufmaxchar = Py_MAX(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013904 }
13905 else {
13906 if (width > len)
Benjamin Peterson7e303732013-06-10 09:19:46 -070013907 bufmaxchar = Py_MAX(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013908 }
13909 }
13910 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
Benjamin Peterson7e303732013-06-10 09:19:46 -070013911 bufmaxchar = Py_MAX(bufmaxchar, maxchar);
Victor Stinneree4544c2012-05-09 22:24:08 +020013912
13913 buflen = width;
13914 if (sign && len == width)
13915 buflen++;
13916
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013917 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020013918 goto onError;
13919
13920 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000013921 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013922 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020013923 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13924 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013925 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013926 if (width > len)
13927 width--;
13928 }
13929 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013930 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013931 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013933 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13934 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13935 writer.pos += 2;
13936 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000013937 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013938 width -= 2;
13939 if (width < 0)
13940 width = 0;
13941 len -= 2;
13942 }
13943 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013944 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013945 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13946 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013947 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013948 }
13949 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013950 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013951 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13952 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013953 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013954 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013955 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13956 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013957 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13958 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13959 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013960 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013961 }
13962 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013963
Victor Stinnerc9d369f2012-06-16 02:22:37 +020013964 if (len) {
13965 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13966 temp, pindex, len);
13967 writer.pos += len;
13968 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013969 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013970 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013971 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13972 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013973 }
Victor Stinneree4544c2012-05-09 22:24:08 +020013974
Victor Stinnerd3f08822012-05-29 12:57:52 +020013975nextarg:
Benjamin Peterson29060642009-01-31 22:14:21 +000013976 if (dict && (argidx < arglen) && c != '%') {
13977 PyErr_SetString(PyExc_TypeError,
13978 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013979 goto onError;
13980 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013981 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013982 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013983 } /* until end */
13984 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013985 PyErr_SetString(PyExc_TypeError,
13986 "not all arguments converted during string formatting");
13987 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013988 }
13989
13990 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013991 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013992 }
13993 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013994 Py_XDECREF(temp);
13995 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013996 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013997
Benjamin Peterson29060642009-01-31 22:14:21 +000013998 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013999 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014000 Py_XDECREF(temp);
14001 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014002 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014003 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014004 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014005 }
14006 return NULL;
14007}
14008
Jeremy Hylton938ace62002-07-17 16:30:39 +000014009static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014010unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14011
Tim Peters6d6c1a32001-08-02 04:15:00 +000014012static PyObject *
14013unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14014{
Benjamin Peterson29060642009-01-31 22:14:21 +000014015 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014016 static char *kwlist[] = {"object", "encoding", "errors", 0};
14017 char *encoding = NULL;
14018 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014019
Benjamin Peterson14339b62009-01-31 16:36:08 +000014020 if (type != &PyUnicode_Type)
14021 return unicode_subtype_new(type, args, kwds);
14022 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014023 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014024 return NULL;
14025 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014026 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014027 if (encoding == NULL && errors == NULL)
14028 return PyObject_Str(x);
14029 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014030 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014031}
14032
Guido van Rossume023fe02001-08-30 03:12:59 +000014033static PyObject *
14034unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14035{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014036 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014037 Py_ssize_t length, char_size;
14038 int share_wstr, share_utf8;
14039 unsigned int kind;
14040 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014041
Benjamin Peterson14339b62009-01-31 16:36:08 +000014042 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014043
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014044 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014045 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014046 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014047 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014048 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014049 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014050 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014051 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014052
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014053 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014054 if (self == NULL) {
14055 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014056 return NULL;
14057 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014058 kind = PyUnicode_KIND(unicode);
14059 length = PyUnicode_GET_LENGTH(unicode);
14060
14061 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014062#ifdef Py_DEBUG
14063 _PyUnicode_HASH(self) = -1;
14064#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014065 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014066#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014067 _PyUnicode_STATE(self).interned = 0;
14068 _PyUnicode_STATE(self).kind = kind;
14069 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014070 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014071 _PyUnicode_STATE(self).ready = 1;
14072 _PyUnicode_WSTR(self) = NULL;
14073 _PyUnicode_UTF8_LENGTH(self) = 0;
14074 _PyUnicode_UTF8(self) = NULL;
14075 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014076 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014077
14078 share_utf8 = 0;
14079 share_wstr = 0;
14080 if (kind == PyUnicode_1BYTE_KIND) {
14081 char_size = 1;
14082 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14083 share_utf8 = 1;
14084 }
14085 else if (kind == PyUnicode_2BYTE_KIND) {
14086 char_size = 2;
14087 if (sizeof(wchar_t) == 2)
14088 share_wstr = 1;
14089 }
14090 else {
14091 assert(kind == PyUnicode_4BYTE_KIND);
14092 char_size = 4;
14093 if (sizeof(wchar_t) == 4)
14094 share_wstr = 1;
14095 }
14096
14097 /* Ensure we won't overflow the length. */
14098 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14099 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014100 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014101 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014102 data = PyObject_MALLOC((length + 1) * char_size);
14103 if (data == NULL) {
14104 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014105 goto onError;
14106 }
14107
Victor Stinnerc3c74152011-10-02 20:39:55 +020014108 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014109 if (share_utf8) {
14110 _PyUnicode_UTF8_LENGTH(self) = length;
14111 _PyUnicode_UTF8(self) = data;
14112 }
14113 if (share_wstr) {
14114 _PyUnicode_WSTR_LENGTH(self) = length;
14115 _PyUnicode_WSTR(self) = (wchar_t *)data;
14116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014117
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014118 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014119 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014120 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014121#ifdef Py_DEBUG
14122 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14123#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014124 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014125 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014126
14127onError:
14128 Py_DECREF(unicode);
14129 Py_DECREF(self);
14130 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014131}
14132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014133PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014134"str(object='') -> str\n\
14135str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014136\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014137Create a new string object from the given object. If encoding or\n\
14138errors is specified, then the object must expose a data buffer\n\
14139that will be decoded using the given encoding and error handler.\n\
14140Otherwise, returns the result of object.__str__() (if defined)\n\
14141or repr(object).\n\
14142encoding defaults to sys.getdefaultencoding().\n\
14143errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014144
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014145static PyObject *unicode_iter(PyObject *seq);
14146
Guido van Rossumd57fd912000-03-10 22:53:23 +000014147PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014148 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014149 "str", /* tp_name */
14150 sizeof(PyUnicodeObject), /* tp_size */
14151 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014152 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014153 (destructor)unicode_dealloc, /* tp_dealloc */
14154 0, /* tp_print */
14155 0, /* tp_getattr */
14156 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014157 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014158 unicode_repr, /* tp_repr */
14159 &unicode_as_number, /* tp_as_number */
14160 &unicode_as_sequence, /* tp_as_sequence */
14161 &unicode_as_mapping, /* tp_as_mapping */
14162 (hashfunc) unicode_hash, /* tp_hash*/
14163 0, /* tp_call*/
14164 (reprfunc) unicode_str, /* tp_str */
14165 PyObject_GenericGetAttr, /* tp_getattro */
14166 0, /* tp_setattro */
14167 0, /* tp_as_buffer */
14168 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014169 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014170 unicode_doc, /* tp_doc */
14171 0, /* tp_traverse */
14172 0, /* tp_clear */
14173 PyUnicode_RichCompare, /* tp_richcompare */
14174 0, /* tp_weaklistoffset */
14175 unicode_iter, /* tp_iter */
14176 0, /* tp_iternext */
14177 unicode_methods, /* tp_methods */
14178 0, /* tp_members */
14179 0, /* tp_getset */
14180 &PyBaseObject_Type, /* tp_base */
14181 0, /* tp_dict */
14182 0, /* tp_descr_get */
14183 0, /* tp_descr_set */
14184 0, /* tp_dictoffset */
14185 0, /* tp_init */
14186 0, /* tp_alloc */
14187 unicode_new, /* tp_new */
14188 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014189};
14190
14191/* Initialize the Unicode implementation */
14192
Victor Stinner3a50e702011-10-18 21:21:00 +020014193int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014194{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014195 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014196 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014197 0x000A, /* LINE FEED */
14198 0x000D, /* CARRIAGE RETURN */
14199 0x001C, /* FILE SEPARATOR */
14200 0x001D, /* GROUP SEPARATOR */
14201 0x001E, /* RECORD SEPARATOR */
14202 0x0085, /* NEXT LINE */
14203 0x2028, /* LINE SEPARATOR */
14204 0x2029, /* PARAGRAPH SEPARATOR */
14205 };
14206
Fred Drakee4315f52000-05-09 19:53:39 +000014207 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014208 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014209 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014210 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014211 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014212
Guido van Rossumcacfc072002-05-24 19:01:59 +000014213 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014214 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014215
14216 /* initialize the linebreak bloom filter */
14217 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014218 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014219 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014220
Christian Heimes26532f72013-07-20 14:57:16 +020014221 if (PyType_Ready(&EncodingMapType) < 0)
14222 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014223
Benjamin Petersonc4311282012-10-30 23:21:10 -040014224 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14225 Py_FatalError("Can't initialize field name iterator type");
14226
14227 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14228 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014229
Victor Stinner3a50e702011-10-18 21:21:00 +020014230#ifdef HAVE_MBCS
14231 winver.dwOSVersionInfoSize = sizeof(winver);
14232 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14233 PyErr_SetFromWindowsErr(0);
14234 return -1;
14235 }
14236#endif
14237 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014238}
14239
14240/* Finalize the Unicode implementation */
14241
/* Free-list clearing entry point.  This implementation performs no work
   and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14247
Guido van Rossumd57fd912000-03-10 22:53:23 +000014248void
Thomas Wouters78890102000-07-22 19:25:51 +000014249_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014250{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014251 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014252
Serhiy Storchaka05997252013-01-26 12:14:02 +020014253 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014254
Serhiy Storchaka05997252013-01-26 12:14:02 +020014255 for (i = 0; i < 256; i++)
14256 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014257 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014258 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014259}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014260
Walter Dörwald16807132007-05-25 13:52:07 +000014261void
14262PyUnicode_InternInPlace(PyObject **p)
14263{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014264 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014265 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014266#ifdef Py_DEBUG
14267 assert(s != NULL);
14268 assert(_PyUnicode_CHECK(s));
14269#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014270 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014271 return;
14272#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014273 /* If it's a subclass, we don't really know what putting
14274 it in the interned dict might do. */
14275 if (!PyUnicode_CheckExact(s))
14276 return;
14277 if (PyUnicode_CHECK_INTERNED(s))
14278 return;
14279 if (interned == NULL) {
14280 interned = PyDict_New();
14281 if (interned == NULL) {
14282 PyErr_Clear(); /* Don't leave an exception */
14283 return;
14284 }
14285 }
14286 /* It might be that the GetItem call fails even
14287 though the key is present in the dictionary,
14288 namely when this happens during a stack overflow. */
14289 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014290 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014291 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014292
Benjamin Peterson29060642009-01-31 22:14:21 +000014293 if (t) {
14294 Py_INCREF(t);
14295 Py_DECREF(*p);
14296 *p = t;
14297 return;
14298 }
Walter Dörwald16807132007-05-25 13:52:07 +000014299
Benjamin Peterson14339b62009-01-31 16:36:08 +000014300 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014301 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014302 PyErr_Clear();
14303 PyThreadState_GET()->recursion_critical = 0;
14304 return;
14305 }
14306 PyThreadState_GET()->recursion_critical = 0;
14307 /* The two references in interned are not counted by refcnt.
14308 The deallocator will take care of this */
14309 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014310 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014311}
14312
14313void
14314PyUnicode_InternImmortal(PyObject **p)
14315{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014316 PyUnicode_InternInPlace(p);
14317 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014318 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014319 Py_INCREF(*p);
14320 }
Walter Dörwald16807132007-05-25 13:52:07 +000014321}
14322
14323PyObject *
14324PyUnicode_InternFromString(const char *cp)
14325{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014326 PyObject *s = PyUnicode_FromString(cp);
14327 if (s == NULL)
14328 return NULL;
14329 PyUnicode_InternInPlace(&s);
14330 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014331}
14332
Alexander Belopolsky40018472011-02-26 01:02:56 +000014333void
14334_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014335{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014336 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014337 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014338 Py_ssize_t i, n;
14339 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014340
Benjamin Peterson14339b62009-01-31 16:36:08 +000014341 if (interned == NULL || !PyDict_Check(interned))
14342 return;
14343 keys = PyDict_Keys(interned);
14344 if (keys == NULL || !PyList_Check(keys)) {
14345 PyErr_Clear();
14346 return;
14347 }
Walter Dörwald16807132007-05-25 13:52:07 +000014348
Benjamin Peterson14339b62009-01-31 16:36:08 +000014349 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14350 detector, interned unicode strings are not forcibly deallocated;
14351 rather, we give them their stolen references back, and then clear
14352 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014353
Benjamin Peterson14339b62009-01-31 16:36:08 +000014354 n = PyList_GET_SIZE(keys);
14355 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014356 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014357 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014358 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014359 if (PyUnicode_READY(s) == -1) {
14360 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014361 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014363 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014364 case SSTATE_NOT_INTERNED:
14365 /* XXX Shouldn't happen */
14366 break;
14367 case SSTATE_INTERNED_IMMORTAL:
14368 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014369 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014370 break;
14371 case SSTATE_INTERNED_MORTAL:
14372 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014373 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014374 break;
14375 default:
14376 Py_FatalError("Inconsistent interned string state.");
14377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014378 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014379 }
14380 fprintf(stderr, "total size of all interned strings: "
14381 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14382 "mortal/immortal\n", mortal_size, immortal_size);
14383 Py_DECREF(keys);
14384 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014385 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014386}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014387
14388
14389/********************* Unicode Iterator **************************/
14390
14391typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014392 PyObject_HEAD
14393 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014394 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014395} unicodeiterobject;
14396
14397static void
14398unicodeiter_dealloc(unicodeiterobject *it)
14399{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014400 _PyObject_GC_UNTRACK(it);
14401 Py_XDECREF(it->it_seq);
14402 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014403}
14404
14405static int
14406unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14407{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014408 Py_VISIT(it->it_seq);
14409 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014410}
14411
14412static PyObject *
14413unicodeiter_next(unicodeiterobject *it)
14414{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014415 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014416
Benjamin Peterson14339b62009-01-31 16:36:08 +000014417 assert(it != NULL);
14418 seq = it->it_seq;
14419 if (seq == NULL)
14420 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014421 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014423 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14424 int kind = PyUnicode_KIND(seq);
14425 void *data = PyUnicode_DATA(seq);
14426 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14427 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014428 if (item != NULL)
14429 ++it->it_index;
14430 return item;
14431 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014432
Benjamin Peterson14339b62009-01-31 16:36:08 +000014433 Py_DECREF(seq);
14434 it->it_seq = NULL;
14435 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014436}
14437
14438static PyObject *
14439unicodeiter_len(unicodeiterobject *it)
14440{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014441 Py_ssize_t len = 0;
14442 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014443 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014444 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014445}
14446
14447PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14448
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014449static PyObject *
14450unicodeiter_reduce(unicodeiterobject *it)
14451{
14452 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014453 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014454 it->it_seq, it->it_index);
14455 } else {
14456 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14457 if (u == NULL)
14458 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014459 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014460 }
14461}
14462
14463PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14464
14465static PyObject *
14466unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14467{
14468 Py_ssize_t index = PyLong_AsSsize_t(state);
14469 if (index == -1 && PyErr_Occurred())
14470 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000014471 if (it->it_seq != NULL) {
14472 if (index < 0)
14473 index = 0;
14474 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
14475 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
14476 it->it_index = index;
14477 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014478 Py_RETURN_NONE;
14479}
14480
14481PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14482
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014483static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014484 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014485 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014486 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14487 reduce_doc},
14488 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14489 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014490 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014491};
14492
14493PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014494 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14495 "str_iterator", /* tp_name */
14496 sizeof(unicodeiterobject), /* tp_basicsize */
14497 0, /* tp_itemsize */
14498 /* methods */
14499 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14500 0, /* tp_print */
14501 0, /* tp_getattr */
14502 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014503 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014504 0, /* tp_repr */
14505 0, /* tp_as_number */
14506 0, /* tp_as_sequence */
14507 0, /* tp_as_mapping */
14508 0, /* tp_hash */
14509 0, /* tp_call */
14510 0, /* tp_str */
14511 PyObject_GenericGetAttr, /* tp_getattro */
14512 0, /* tp_setattro */
14513 0, /* tp_as_buffer */
14514 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14515 0, /* tp_doc */
14516 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14517 0, /* tp_clear */
14518 0, /* tp_richcompare */
14519 0, /* tp_weaklistoffset */
14520 PyObject_SelfIter, /* tp_iter */
14521 (iternextfunc)unicodeiter_next, /* tp_iternext */
14522 unicodeiter_methods, /* tp_methods */
14523 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014524};
14525
14526static PyObject *
14527unicode_iter(PyObject *seq)
14528{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014529 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014530
Benjamin Peterson14339b62009-01-31 16:36:08 +000014531 if (!PyUnicode_Check(seq)) {
14532 PyErr_BadInternalCall();
14533 return NULL;
14534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014535 if (PyUnicode_READY(seq) == -1)
14536 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014537 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14538 if (it == NULL)
14539 return NULL;
14540 it->it_index = 0;
14541 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014542 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014543 _PyObject_GC_TRACK(it);
14544 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014545}
14546
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014547
14548size_t
14549Py_UNICODE_strlen(const Py_UNICODE *u)
14550{
14551 int res = 0;
14552 while(*u++)
14553 res++;
14554 return res;
14555}
14556
14557Py_UNICODE*
14558Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14559{
14560 Py_UNICODE *u = s1;
14561 while ((*u++ = *s2++));
14562 return s1;
14563}
14564
14565Py_UNICODE*
14566Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14567{
14568 Py_UNICODE *u = s1;
14569 while ((*u++ = *s2++))
14570 if (n-- == 0)
14571 break;
14572 return s1;
14573}
14574
14575Py_UNICODE*
14576Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14577{
14578 Py_UNICODE *u1 = s1;
14579 u1 += Py_UNICODE_strlen(u1);
14580 Py_UNICODE_strcpy(u1, s2);
14581 return s1;
14582}
14583
14584int
14585Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14586{
14587 while (*s1 && *s2 && *s1 == *s2)
14588 s1++, s2++;
14589 if (*s1 && *s2)
14590 return (*s1 < *s2) ? -1 : +1;
14591 if (*s1)
14592 return 1;
14593 if (*s2)
14594 return -1;
14595 return 0;
14596}
14597
14598int
14599Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14600{
14601 register Py_UNICODE u1, u2;
14602 for (; n != 0; n--) {
14603 u1 = *s1;
14604 u2 = *s2;
14605 if (u1 != u2)
14606 return (u1 < u2) ? -1 : +1;
14607 if (u1 == '\0')
14608 return 0;
14609 s1++;
14610 s2++;
14611 }
14612 return 0;
14613}
14614
14615Py_UNICODE*
14616Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14617{
14618 const Py_UNICODE *p;
14619 for (p = s; *p; p++)
14620 if (*p == c)
14621 return (Py_UNICODE*)p;
14622 return NULL;
14623}
14624
14625Py_UNICODE*
14626Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14627{
14628 const Py_UNICODE *p;
14629 p = s + Py_UNICODE_strlen(s);
14630 while (p != s) {
14631 p--;
14632 if (*p == c)
14633 return (Py_UNICODE*)p;
14634 }
14635 return NULL;
14636}
Victor Stinner331ea922010-08-10 16:37:20 +000014637
Victor Stinner71133ff2010-09-01 23:43:53 +000014638Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014639PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014640{
Victor Stinner577db2c2011-10-11 22:12:48 +020014641 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014642 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014644 if (!PyUnicode_Check(unicode)) {
14645 PyErr_BadArgument();
14646 return NULL;
14647 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014648 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014649 if (u == NULL)
14650 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014651 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014652 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014653 PyErr_NoMemory();
14654 return NULL;
14655 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014656 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014657 size *= sizeof(Py_UNICODE);
14658 copy = PyMem_Malloc(size);
14659 if (copy == NULL) {
14660 PyErr_NoMemory();
14661 return NULL;
14662 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014663 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014664 return copy;
14665}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014666
Georg Brandl66c221e2010-10-14 07:04:07 +000014667/* A _string module, to export formatter_parser and formatter_field_name_split
14668 to the string.Formatter class implemented in Python. */
14669
14670static PyMethodDef _string_methods[] = {
14671 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14672 METH_O, PyDoc_STR("split the argument as a field name")},
14673 {"formatter_parser", (PyCFunction) formatter_parser,
14674 METH_O, PyDoc_STR("parse the argument as a format string")},
14675 {NULL, NULL}
14676};
14677
14678static struct PyModuleDef _string_module = {
14679 PyModuleDef_HEAD_INIT,
14680 "_string",
14681 PyDoc_STR("string helper module"),
14682 0,
14683 _string_methods,
14684 NULL,
14685 NULL,
14686 NULL,
14687 NULL
14688};
14689
14690PyMODINIT_FUNC
14691PyInit__string(void)
14692{
14693 return PyModule_Create(&_string_module);
14694}
14695
14696
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014697#ifdef __cplusplus
14698}
14699#endif