/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000065
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066
67#ifdef __cplusplus
68extern "C" {
69#endif
70
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Type check used inside the accessor macros of this file.  In debug
   builds it runs _PyUnicode_CheckConsistency() (without checking the
   string content); otherwise it is a plain PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020079
/* Raw field accessors.  The underscore-prefixed variants only cast and
   read the field (so they can also be used as lvalues); the variants
   without the underscore additionally assert that the object is a valid,
   ready string. */

/* UTF-8 buffer pointer stored in a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 representation of op.  For a compact ASCII string this is the
   memory located directly after the PyASCIIObject header; otherwise it is
   the stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length field stored in a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length of the UTF-8 representation of op.  For a compact ASCII string
   this is the string length itself; otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* wstr pointer (PyASCIIObject field). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* wstr_length (PyCompactUnicodeObject field). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length, state and hash (PyASCIIObject fields), without any check. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind and length, after asserting that op is a string. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any pointer of a (non-compact) PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200114
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when op is
   already ready, otherwise to the result of _PyUnicode_Ready(op).  The
   previous definition is dropped first so that this one, which asserts
   via _PyUnicode_CHECK(), is used throughout this file. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200121
/* true if the utf8 pointer of op is the character data buffer itself
   (must not be used on compact ASCII strings) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of op is the character data buffer itself.
   The macro only refers to its own parameter, so it works whatever the
   caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
129
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   object is not ready yet or wstr differs from the character data */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
145
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The bulk of the input is copied four characters per iteration; the
   remaining (at most three) characters are copied one at a time.  The
   "to" argument is parenthesized before being cast so that an expression
   such as "buf + k" is converted as a whole. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *) (to);                \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200169
Walter Dörwald16807132007-05-25 13:52:07 +0000170/* This dictionary holds all interned unicode strings. Note that references
171 to strings in this dictionary are *not* counted in the string's ob_refcnt.
172 When the interned string reaches a refcnt of 0 the string deallocation
173 function will delete the reference from this dictionary.
174
175 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000176 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000177*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200178static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200181static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails,
   unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the calling
   function.  The returned value is NULL if the empty string could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200202/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200203static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200204
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205/* Single character Unicode strings in the Latin-1 range are being
206 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by an ASCII code point (0..127): the entry is 1
   for the whitespace characters listed below and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
239
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200240/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200241static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200242static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100243static int unicode_modifiable(PyObject *unicode);
244
Victor Stinnerfe226c02011-10-03 03:52:20 +0200245
Alexander Belopolsky40018472011-02-26 01:02:56 +0000246static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100247_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200248static PyObject *
249_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
250static PyObject *
251_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
252
253static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000255 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100256 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000257 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
258
Alexander Belopolsky40018472011-02-26 01:02:56 +0000259static void
260raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300261 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100262 PyObject *unicode,
263 Py_ssize_t startpos, Py_ssize_t endpos,
264 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000265
/* Same for linebreaks.
   Lookup table indexed by an ASCII code point (0..127): the entry is 1
   for the line break characters listed below and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
293
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300294/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
295 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000297PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000299#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000300 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000301#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000302 /* This is actually an illegal character, so it should
303 not be passed to unichr. */
304 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000305#endif
306}
307
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a Unicode object.

   op            -- object to check; must pass PyUnicode_Check()
   check_content -- if non-zero (and the string is not a wchar_t-only
                    string), also scan the characters to verify that the
                    narrowest possible kind is used and that the buffer is
                    terminated by a null character

   Every violated invariant trips an assert(); when all checks pass the
   function returns 1, so it can itself be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject fields exist */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data directly follows the struct */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* non-compact, ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data buffer exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an absent buffer must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be the terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
423
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100424static PyObject*
425unicode_result_wchar(PyObject *unicode)
426{
427#ifndef Py_DEBUG
428 Py_ssize_t len;
429
430 assert(Py_REFCNT(unicode) == 1);
431
432 len = _PyUnicode_WSTR_LENGTH(unicode);
433 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100434 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200435 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100436 }
437
438 if (len == 1) {
439 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100440 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
442 Py_DECREF(unicode);
443 return latin1_char;
444 }
445 }
446
447 if (_PyUnicode_Ready(unicode) < 0) {
448 Py_XDECREF(unicode);
449 return NULL;
450 }
451#else
452 /* don't make the result ready in debug mode to ensure that the caller
453 makes the string ready before using it */
454 assert(_PyUnicode_CheckConsistency(unicode, 1));
455#endif
456 return unicode;
457}
458
459static PyObject*
460unicode_result_ready(PyObject *unicode)
461{
462 Py_ssize_t length;
463
464 length = PyUnicode_GET_LENGTH(unicode);
465 if (length == 0) {
466 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100467 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200468 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100469 }
470 return unicode_empty;
471 }
472
473 if (length == 1) {
474 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
475 if (ch < 256) {
476 PyObject *latin1_char = unicode_latin1[ch];
477 if (latin1_char != NULL) {
478 if (unicode != latin1_char) {
479 Py_INCREF(latin1_char);
480 Py_DECREF(unicode);
481 }
482 return latin1_char;
483 }
484 else {
485 assert(_PyUnicode_CheckConsistency(unicode, 1));
486 Py_INCREF(unicode);
487 unicode_latin1[ch] = unicode;
488 return unicode;
489 }
490 }
491 }
492
493 assert(_PyUnicode_CheckConsistency(unicode, 1));
494 return unicode;
495}
496
497static PyObject*
498unicode_result(PyObject *unicode)
499{
500 assert(_PyUnicode_CHECK(unicode));
501 if (PyUnicode_IS_READY(unicode))
502 return unicode_result_ready(unicode);
503 else
504 return unicode_result_wchar(unicode);
505}
506
Victor Stinnerc4b49542011-12-11 22:44:26 +0100507static PyObject*
508unicode_result_unchanged(PyObject *unicode)
509{
510 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500511 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100512 return NULL;
513 Py_INCREF(unicode);
514 return unicode;
515 }
516 else
517 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100518 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100519}
520
#ifdef HAVE_MBCS
/* Windows version information; only declared when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
524
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525/* --- Bloom Filters ----------------------------------------------------- */
526
527/* stuff to implement simple "bloom filters" for Unicode characters.
528 to keep things simple, we use a single bitmask, using the least 5
529 bits from each unicode characters as the bit index. */
530
531/* the linebreak mask is set up by Unicode_Init below */
532
Antoine Pitrouf068f942010-01-13 14:19:12 +0000533#if LONG_BIT >= 128
534#define BLOOM_WIDTH 128
535#elif LONG_BIT >= 64
536#define BLOOM_WIDTH 64
537#elif LONG_BIT >= 32
538#define BLOOM_WIDTH 32
539#else
540#error "LONG_BIT is smaller than 32"
541#endif
542
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543#define BLOOM_MASK unsigned long
544
Serhiy Storchaka05997252013-01-26 12:14:02 +0200545static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546
Antoine Pitrouf068f942010-01-13 14:19:12 +0000547#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
548#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000549
Benjamin Peterson29060642009-01-31 22:14:21 +0000550#define BLOOM_LINEBREAK(ch) \
551 ((ch) < 128U ? ascii_linebreak[(ch)] : \
552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
557 /* calculate simple bloom-style bitmask for a given unicode string */
558
Antoine Pitrouf068f942010-01-13 14:19:12 +0000559 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000560 Py_ssize_t i;
561
562 mask = 0;
563 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200564 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000565
566 return mask;
567}
568
/* true if chr occurs in str: the bloom mask is tested first and
   PyUnicode_FindChar() is only called when the mask does not rule the
   character out */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200573/* Compilation of templated routines */
574
575#include "stringlib/asciilib.h"
576#include "stringlib/fastsearch.h"
577#include "stringlib/partition.h"
578#include "stringlib/split.h"
579#include "stringlib/count.h"
580#include "stringlib/find.h"
581#include "stringlib/find_max_char.h"
582#include "stringlib/localeutil.h"
583#include "stringlib/undef.h"
584
585#include "stringlib/ucs1lib.h"
586#include "stringlib/fastsearch.h"
587#include "stringlib/partition.h"
588#include "stringlib/split.h"
589#include "stringlib/count.h"
590#include "stringlib/find.h"
591#include "stringlib/find_max_char.h"
592#include "stringlib/localeutil.h"
593#include "stringlib/undef.h"
594
595#include "stringlib/ucs2lib.h"
596#include "stringlib/fastsearch.h"
597#include "stringlib/partition.h"
598#include "stringlib/split.h"
599#include "stringlib/count.h"
600#include "stringlib/find.h"
601#include "stringlib/find_max_char.h"
602#include "stringlib/localeutil.h"
603#include "stringlib/undef.h"
604
605#include "stringlib/ucs4lib.h"
606#include "stringlib/fastsearch.h"
607#include "stringlib/partition.h"
608#include "stringlib/split.h"
609#include "stringlib/count.h"
610#include "stringlib/find.h"
611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200615#include "stringlib/unicodedefs.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100619#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200620
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621/* --- Unicode Object ----------------------------------------------------- */
622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200624fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200625
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200626Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
627 Py_ssize_t size, Py_UCS4 ch,
628 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200629{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200630 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
631
632 switch (kind) {
633 case PyUnicode_1BYTE_KIND:
634 {
635 Py_UCS1 ch1 = (Py_UCS1) ch;
636 if (ch1 == ch)
637 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
638 else
639 return -1;
640 }
641 case PyUnicode_2BYTE_KIND:
642 {
643 Py_UCS2 ch2 = (Py_UCS2) ch;
644 if (ch2 == ch)
645 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
646 else
647 return -1;
648 }
649 case PyUnicode_4BYTE_KIND:
650 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
651 default:
652 assert(0);
653 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655}
656
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657static PyObject*
658resize_compact(PyObject *unicode, Py_ssize_t length)
659{
660 Py_ssize_t char_size;
661 Py_ssize_t struct_size;
662 Py_ssize_t new_size;
663 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100664 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200665 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200666 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100667 assert(PyUnicode_IS_COMPACT(unicode));
668
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200669 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100670 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200671 struct_size = sizeof(PyASCIIObject);
672 else
673 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200674 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
677 PyErr_NoMemory();
678 return NULL;
679 }
680 new_size = (struct_size + (length + 1) * char_size);
681
Serhiy Storchaka31b94102015-12-03 01:02:03 +0200682 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
683 PyObject_DEL(_PyUnicode_UTF8(unicode));
684 _PyUnicode_UTF8(unicode) = NULL;
685 _PyUnicode_UTF8_LENGTH(unicode) = 0;
686 }
Victor Stinner84def372011-12-11 20:04:56 +0100687 _Py_DEC_REFTOTAL;
688 _Py_ForgetReference(unicode);
689
690 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
691 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100692 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 PyErr_NoMemory();
694 return NULL;
695 }
Victor Stinner84def372011-12-11 20:04:56 +0100696 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100698
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200700 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200701 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100702 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200703 _PyUnicode_WSTR_LENGTH(unicode) = length;
704 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100705 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
706 PyObject_DEL(_PyUnicode_WSTR(unicode));
707 _PyUnicode_WSTR(unicode) = NULL;
708 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
710 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200711 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200712 return unicode;
713}
714
Alexander Belopolsky40018472011-02-26 01:02:56 +0000715static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200716resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717{
Victor Stinner95663112011-10-04 01:03:50 +0200718 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100719 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200720 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000722
Victor Stinnerfe226c02011-10-03 03:52:20 +0200723 if (PyUnicode_IS_READY(unicode)) {
724 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200725 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 void *data;
727
728 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200729 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
731 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732
733 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
734 PyErr_NoMemory();
735 return -1;
736 }
737 new_size = (length + 1) * char_size;
738
Victor Stinner7a9105a2011-12-12 00:13:42 +0100739 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
740 {
741 PyObject_DEL(_PyUnicode_UTF8(unicode));
742 _PyUnicode_UTF8(unicode) = NULL;
743 _PyUnicode_UTF8_LENGTH(unicode) = 0;
744 }
745
Victor Stinnerfe226c02011-10-03 03:52:20 +0200746 data = (PyObject *)PyObject_REALLOC(data, new_size);
747 if (data == NULL) {
748 PyErr_NoMemory();
749 return -1;
750 }
751 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200752 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200753 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200754 _PyUnicode_WSTR_LENGTH(unicode) = length;
755 }
756 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200757 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200758 _PyUnicode_UTF8_LENGTH(unicode) = length;
759 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 _PyUnicode_LENGTH(unicode) = length;
761 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200762 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200763 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 }
Victor Stinner95663112011-10-04 01:03:50 +0200767 assert(_PyUnicode_WSTR(unicode) != NULL);
768
769 /* check for integer overflow */
770 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
771 PyErr_NoMemory();
772 return -1;
773 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100774 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200775 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100776 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200777 if (!wstr) {
778 PyErr_NoMemory();
779 return -1;
780 }
781 _PyUnicode_WSTR(unicode) = wstr;
782 _PyUnicode_WSTR(unicode)[length] = 0;
783 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200784 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000785 return 0;
786}
787
Victor Stinnerfe226c02011-10-03 03:52:20 +0200788static PyObject*
789resize_copy(PyObject *unicode, Py_ssize_t length)
790{
791 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100792 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200793 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100794
Benjamin Petersonbac79492012-01-14 13:34:47 -0500795 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100796 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797
798 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
799 if (copy == NULL)
800 return NULL;
801
802 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200803 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200804 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200805 }
806 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200807 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100808
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200809 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200810 if (w == NULL)
811 return NULL;
812 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
813 copy_length = Py_MIN(copy_length, length);
814 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
815 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200816 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200817 }
818}
819
Guido van Rossumd57fd912000-03-10 22:53:23 +0000820/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000821 Ux0000 terminated; some code (e.g. new_identifier)
822 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823
824 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000825 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000826
827*/
828
Alexander Belopolsky40018472011-02-26 01:02:56 +0000829static PyUnicodeObject *
830_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000831{
832 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834
Thomas Wouters477c8d52006-05-27 19:21:47 +0000835 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 if (length == 0 && unicode_empty != NULL) {
837 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200838 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000839 }
840
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000841 /* Ensure we won't overflow the size. */
842 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
843 return (PyUnicodeObject *)PyErr_NoMemory();
844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200845 if (length < 0) {
846 PyErr_SetString(PyExc_SystemError,
847 "Negative size passed to _PyUnicode_New");
848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000849 }
850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200851 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
852 if (unicode == NULL)
853 return NULL;
854 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
855 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
856 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100857 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000858 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100859 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861
Jeremy Hyltond8082792003-09-16 19:41:39 +0000862 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000863 * the caller fails before initializing str -- unicode_resize()
864 * reads str[0], and the Keep-Alive optimization can keep memory
865 * allocated for str alive across a call to unicode_dealloc(unicode).
866 * We don't want unicode_resize to read uninitialized memory in
867 * that case.
868 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200869 _PyUnicode_WSTR(unicode)[0] = 0;
870 _PyUnicode_WSTR(unicode)[length] = 0;
871 _PyUnicode_WSTR_LENGTH(unicode) = length;
872 _PyUnicode_HASH(unicode) = -1;
873 _PyUnicode_STATE(unicode).interned = 0;
874 _PyUnicode_STATE(unicode).kind = 0;
875 _PyUnicode_STATE(unicode).compact = 0;
876 _PyUnicode_STATE(unicode).ready = 0;
877 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200878 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200879 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200880 _PyUnicode_UTF8(unicode) = NULL;
881 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100882 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883 return unicode;
884}
885
Victor Stinnerf42dc442011-10-02 23:33:16 +0200886static const char*
887unicode_kind_name(PyObject *unicode)
888{
Victor Stinner42dfd712011-10-03 14:41:45 +0200889 /* don't check consistency: unicode_kind_name() is called from
890 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200891 if (!PyUnicode_IS_COMPACT(unicode))
892 {
893 if (!PyUnicode_IS_READY(unicode))
894 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600895 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 {
897 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 return "legacy ascii";
900 else
901 return "legacy latin1";
902 case PyUnicode_2BYTE_KIND:
903 return "legacy UCS2";
904 case PyUnicode_4BYTE_KIND:
905 return "legacy UCS4";
906 default:
907 return "<legacy invalid kind>";
908 }
909 }
910 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600911 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200912 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200913 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200914 return "ascii";
915 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200916 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200917 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200918 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200919 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200920 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200921 default:
922 return "<invalid compact kind>";
923 }
924}
925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200926#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
931
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
935void *_PyUnicode_data(void *unicode){
936 printf("obj %p\n", unicode);
937 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
938 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
939 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
940 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
941 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
942 return PyUnicode_DATA(unicode);
943}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200944
945void
946_PyUnicode_Dump(PyObject *op)
947{
948 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200949 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
950 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
951 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200952
Victor Stinnera849a4b2011-10-03 12:12:11 +0200953 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200954 {
955 if (ascii->state.ascii)
956 data = (ascii + 1);
957 else
958 data = (compact + 1);
959 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200960 else
961 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200962 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
963
Victor Stinnera849a4b2011-10-03 12:12:11 +0200964 if (ascii->wstr == data)
965 printf("shared ");
966 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200967
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200969 printf(" (%zu), ", compact->wstr_length);
970 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
971 printf("shared ");
972 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200973 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200974 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200975}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976#endif
977
978PyObject *
979PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
980{
981 PyObject *obj;
982 PyCompactUnicodeObject *unicode;
983 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200984 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200985 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200986 Py_ssize_t char_size;
987 Py_ssize_t struct_size;
988
989 /* Optimization for empty strings */
990 if (size == 0 && unicode_empty != NULL) {
991 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200992 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200993 }
994
Victor Stinner9e9d6892011-10-04 01:02:02 +0200995 is_ascii = 0;
996 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200997 struct_size = sizeof(PyCompactUnicodeObject);
998 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200999 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001000 char_size = 1;
1001 is_ascii = 1;
1002 struct_size = sizeof(PyASCIIObject);
1003 }
1004 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001005 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 char_size = 1;
1007 }
1008 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001009 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001010 char_size = 2;
1011 if (sizeof(wchar_t) == 2)
1012 is_sharing = 1;
1013 }
1014 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001015 if (maxchar > MAX_UNICODE) {
1016 PyErr_SetString(PyExc_SystemError,
1017 "invalid maximum character passed to PyUnicode_New");
1018 return NULL;
1019 }
Victor Stinner8f825062012-04-27 13:55:39 +02001020 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001021 char_size = 4;
1022 if (sizeof(wchar_t) == 4)
1023 is_sharing = 1;
1024 }
1025
1026 /* Ensure we won't overflow the size. */
1027 if (size < 0) {
1028 PyErr_SetString(PyExc_SystemError,
1029 "Negative size passed to PyUnicode_New");
1030 return NULL;
1031 }
1032 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1033 return PyErr_NoMemory();
1034
1035 /* Duplicated allocation code from _PyObject_New() instead of a call to
1036 * PyObject_New() so we are able to allocate space for the object and
1037 * it's data buffer.
1038 */
1039 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1040 if (obj == NULL)
1041 return PyErr_NoMemory();
1042 obj = PyObject_INIT(obj, &PyUnicode_Type);
1043 if (obj == NULL)
1044 return NULL;
1045
1046 unicode = (PyCompactUnicodeObject *)obj;
1047 if (is_ascii)
1048 data = ((PyASCIIObject*)obj) + 1;
1049 else
1050 data = unicode + 1;
1051 _PyUnicode_LENGTH(unicode) = size;
1052 _PyUnicode_HASH(unicode) = -1;
1053 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001054 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055 _PyUnicode_STATE(unicode).compact = 1;
1056 _PyUnicode_STATE(unicode).ready = 1;
1057 _PyUnicode_STATE(unicode).ascii = is_ascii;
1058 if (is_ascii) {
1059 ((char*)data)[size] = 0;
1060 _PyUnicode_WSTR(unicode) = NULL;
1061 }
Victor Stinner8f825062012-04-27 13:55:39 +02001062 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 ((char*)data)[size] = 0;
1064 _PyUnicode_WSTR(unicode) = NULL;
1065 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001067 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 else {
1070 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001071 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001072 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001074 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 ((Py_UCS4*)data)[size] = 0;
1076 if (is_sharing) {
1077 _PyUnicode_WSTR_LENGTH(unicode) = size;
1078 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1079 }
1080 else {
1081 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1082 _PyUnicode_WSTR(unicode) = NULL;
1083 }
1084 }
Victor Stinner8f825062012-04-27 13:55:39 +02001085#ifdef Py_DEBUG
1086 /* Fill the data with invalid characters to detect bugs earlier.
1087 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1088 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1089 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1090 memset(data, 0xff, size * kind);
1091#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001092 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093 return obj;
1094}
1095
1096#if SIZEOF_WCHAR_T == 2
1097/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1098 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001099 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001100
1101 This function assumes that unicode can hold one more code point than wstr
1102 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001103static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001105 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106{
1107 const wchar_t *iter;
1108 Py_UCS4 *ucs4_out;
1109
Victor Stinner910337b2011-10-03 03:20:16 +02001110 assert(unicode != NULL);
1111 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001112 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1113 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1114
1115 for (iter = begin; iter < end; ) {
1116 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1117 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001118 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1119 && (iter+1) < end
1120 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 {
Victor Stinner551ac952011-11-29 22:58:13 +01001122 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 iter += 2;
1124 }
1125 else {
1126 *ucs4_out++ = *iter;
1127 iter++;
1128 }
1129 }
1130 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1131 _PyUnicode_GET_LENGTH(unicode)));
1132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133}
1134#endif
1135
Victor Stinnercd9950f2011-10-02 00:34:53 +02001136static int
Victor Stinner488fa492011-12-12 00:01:39 +01001137unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001138{
Victor Stinner488fa492011-12-12 00:01:39 +01001139 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001140 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001141 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001142 return -1;
1143 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001144 return 0;
1145}
1146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147static int
1148_copy_characters(PyObject *to, Py_ssize_t to_start,
1149 PyObject *from, Py_ssize_t from_start,
1150 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001152 unsigned int from_kind, to_kind;
1153 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154
Victor Stinneree4544c2012-05-09 22:24:08 +02001155 assert(0 <= how_many);
1156 assert(0 <= from_start);
1157 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001158 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001159 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001160 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161
Victor Stinnerd3f08822012-05-29 12:57:52 +02001162 assert(PyUnicode_Check(to));
1163 assert(PyUnicode_IS_READY(to));
1164 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1165
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001166 if (how_many == 0)
1167 return 0;
1168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001170 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001172 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173
Victor Stinnerf1852262012-06-16 16:38:26 +02001174#ifdef Py_DEBUG
1175 if (!check_maxchar
1176 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1177 {
1178 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1179 Py_UCS4 ch;
1180 Py_ssize_t i;
1181 for (i=0; i < how_many; i++) {
1182 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1183 assert(ch <= to_maxchar);
1184 }
1185 }
1186#endif
1187
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001188 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001189 if (check_maxchar
1190 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1191 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001192 /* Writing Latin-1 characters into an ASCII string requires to
1193 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001194 Py_UCS4 max_char;
1195 max_char = ucs1lib_find_max_char(from_data,
1196 (Py_UCS1*)from_data + how_many);
1197 if (max_char >= 128)
1198 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001199 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001200 Py_MEMCPY((char*)to_data + to_kind * to_start,
1201 (char*)from_data + from_kind * from_start,
1202 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else if (from_kind == PyUnicode_1BYTE_KIND
1205 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001206 {
1207 _PyUnicode_CONVERT_BYTES(
1208 Py_UCS1, Py_UCS2,
1209 PyUnicode_1BYTE_DATA(from) + from_start,
1210 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1211 PyUnicode_2BYTE_DATA(to) + to_start
1212 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001213 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001214 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001215 && to_kind == PyUnicode_4BYTE_KIND)
1216 {
1217 _PyUnicode_CONVERT_BYTES(
1218 Py_UCS1, Py_UCS4,
1219 PyUnicode_1BYTE_DATA(from) + from_start,
1220 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1221 PyUnicode_4BYTE_DATA(to) + to_start
1222 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001223 }
1224 else if (from_kind == PyUnicode_2BYTE_KIND
1225 && to_kind == PyUnicode_4BYTE_KIND)
1226 {
1227 _PyUnicode_CONVERT_BYTES(
1228 Py_UCS2, Py_UCS4,
1229 PyUnicode_2BYTE_DATA(from) + from_start,
1230 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1231 PyUnicode_4BYTE_DATA(to) + to_start
1232 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001233 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001234 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001235 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1236
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001237 if (!check_maxchar) {
1238 if (from_kind == PyUnicode_2BYTE_KIND
1239 && to_kind == PyUnicode_1BYTE_KIND)
1240 {
1241 _PyUnicode_CONVERT_BYTES(
1242 Py_UCS2, Py_UCS1,
1243 PyUnicode_2BYTE_DATA(from) + from_start,
1244 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1245 PyUnicode_1BYTE_DATA(to) + to_start
1246 );
1247 }
1248 else if (from_kind == PyUnicode_4BYTE_KIND
1249 && to_kind == PyUnicode_1BYTE_KIND)
1250 {
1251 _PyUnicode_CONVERT_BYTES(
1252 Py_UCS4, Py_UCS1,
1253 PyUnicode_4BYTE_DATA(from) + from_start,
1254 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1255 PyUnicode_1BYTE_DATA(to) + to_start
1256 );
1257 }
1258 else if (from_kind == PyUnicode_4BYTE_KIND
1259 && to_kind == PyUnicode_2BYTE_KIND)
1260 {
1261 _PyUnicode_CONVERT_BYTES(
1262 Py_UCS4, Py_UCS2,
1263 PyUnicode_4BYTE_DATA(from) + from_start,
1264 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1265 PyUnicode_2BYTE_DATA(to) + to_start
1266 );
1267 }
1268 else {
1269 assert(0);
1270 return -1;
1271 }
1272 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001273 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001274 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001275 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001276 Py_ssize_t i;
1277
Victor Stinnera0702ab2011-09-29 14:14:38 +02001278 for (i=0; i < how_many; i++) {
1279 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001280 if (ch > to_maxchar)
1281 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1283 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001284 }
1285 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001286 return 0;
1287}
1288
Victor Stinnerd3f08822012-05-29 12:57:52 +02001289void
1290_PyUnicode_FastCopyCharacters(
1291 PyObject *to, Py_ssize_t to_start,
1292 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001293{
1294 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1295}
1296
1297Py_ssize_t
1298PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1299 PyObject *from, Py_ssize_t from_start,
1300 Py_ssize_t how_many)
1301{
1302 int err;
1303
1304 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1305 PyErr_BadInternalCall();
1306 return -1;
1307 }
1308
Benjamin Petersonbac79492012-01-14 13:34:47 -05001309 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001310 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001311 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001312 return -1;
1313
Victor Stinnerd3f08822012-05-29 12:57:52 +02001314 if (from_start < 0) {
1315 PyErr_SetString(PyExc_IndexError, "string index out of range");
1316 return -1;
1317 }
1318 if (to_start < 0) {
1319 PyErr_SetString(PyExc_IndexError, "string index out of range");
1320 return -1;
1321 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001322 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1323 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1324 PyErr_Format(PyExc_SystemError,
1325 "Cannot write %zi characters at %zi "
1326 "in a string of %zi characters",
1327 how_many, to_start, PyUnicode_GET_LENGTH(to));
1328 return -1;
1329 }
1330
1331 if (how_many == 0)
1332 return 0;
1333
Victor Stinner488fa492011-12-12 00:01:39 +01001334 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001335 return -1;
1336
1337 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1338 if (err) {
1339 PyErr_Format(PyExc_SystemError,
1340 "Cannot copy %s characters "
1341 "into a string of %s characters",
1342 unicode_kind_name(from),
1343 unicode_kind_name(to));
1344 return -1;
1345 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001346 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347}
1348
Victor Stinner17222162011-09-28 22:15:37 +02001349/* Find the maximum code point and count the number of surrogate pairs so a
1350 correct string length can be computed before converting a string to UCS4.
1351 This function counts single surrogates as a character and not as a pair.
1352
1353 Return 0 on success, or -1 on error. */
1354static int
1355find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1356 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357{
1358 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001359 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001360
Victor Stinnerc53be962011-10-02 21:33:54 +02001361 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362 *num_surrogates = 0;
1363 *maxchar = 0;
1364
1365 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001367 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1368 && (iter+1) < end
1369 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001371 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373 iter += 2;
1374 }
1375 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001377 {
1378 ch = *iter;
1379 iter++;
1380 }
1381 if (ch > *maxchar) {
1382 *maxchar = ch;
1383 if (*maxchar > MAX_UNICODE) {
1384 PyErr_Format(PyExc_ValueError,
1385 "character U+%x is not in range [U+0000; U+10ffff]",
1386 ch);
1387 return -1;
1388 }
1389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 }
1391 return 0;
1392}
1393
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001394int
1395_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396{
1397 wchar_t *end;
1398 Py_UCS4 maxchar = 0;
1399 Py_ssize_t num_surrogates;
1400#if SIZEOF_WCHAR_T == 2
1401 Py_ssize_t length_wo_surrogates;
1402#endif
1403
Georg Brandl7597add2011-10-05 16:36:47 +02001404 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001405 strings were created using _PyObject_New() and where no canonical
1406 representation (the str field) has been set yet aka strings
1407 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001408 assert(_PyUnicode_CHECK(unicode));
1409 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001411 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001412 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001413 /* Actually, it should neither be interned nor be anything else: */
1414 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001417 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001418 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420
1421 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001422 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1423 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 PyErr_NoMemory();
1425 return -1;
1426 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001427 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001428 _PyUnicode_WSTR(unicode), end,
1429 PyUnicode_1BYTE_DATA(unicode));
1430 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1431 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1432 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1433 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001434 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001435 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001436 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 }
1438 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001439 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001440 _PyUnicode_UTF8(unicode) = NULL;
1441 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 }
1443 PyObject_FREE(_PyUnicode_WSTR(unicode));
1444 _PyUnicode_WSTR(unicode) = NULL;
1445 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1446 }
1447 /* In this case we might have to convert down from 4-byte native
1448 wchar_t to 2-byte unicode. */
1449 else if (maxchar < 65536) {
1450 assert(num_surrogates == 0 &&
1451 "FindMaxCharAndNumSurrogatePairs() messed up");
1452
Victor Stinner506f5922011-09-28 22:34:18 +02001453#if SIZEOF_WCHAR_T == 2
1454 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001455 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001456 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1457 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1458 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001459 _PyUnicode_UTF8(unicode) = NULL;
1460 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001461#else
1462 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001463 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001464 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001465 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001466 PyErr_NoMemory();
1467 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 }
Victor Stinner506f5922011-09-28 22:34:18 +02001469 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1470 _PyUnicode_WSTR(unicode), end,
1471 PyUnicode_2BYTE_DATA(unicode));
1472 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1473 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1474 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001475 _PyUnicode_UTF8(unicode) = NULL;
1476 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001477 PyObject_FREE(_PyUnicode_WSTR(unicode));
1478 _PyUnicode_WSTR(unicode) = NULL;
1479 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1480#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 }
1482 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1483 else {
1484#if SIZEOF_WCHAR_T == 2
1485 /* in case the native representation is 2-bytes, we need to allocate a
1486 new normalized 4-byte version. */
1487 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001488 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1489 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 PyErr_NoMemory();
1491 return -1;
1492 }
1493 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1494 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001495 _PyUnicode_UTF8(unicode) = NULL;
1496 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001497 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1498 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001499 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001500 PyObject_FREE(_PyUnicode_WSTR(unicode));
1501 _PyUnicode_WSTR(unicode) = NULL;
1502 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1503#else
1504 assert(num_surrogates == 0);
1505
Victor Stinnerc3c74152011-10-02 20:39:55 +02001506 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001508 _PyUnicode_UTF8(unicode) = NULL;
1509 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001510 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1511#endif
1512 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1513 }
1514 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001515 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 return 0;
1517}
1518
Alexander Belopolsky40018472011-02-26 01:02:56 +00001519static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001520unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001521{
Walter Dörwald16807132007-05-25 13:52:07 +00001522 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001523 case SSTATE_NOT_INTERNED:
1524 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001525
Benjamin Peterson29060642009-01-31 22:14:21 +00001526 case SSTATE_INTERNED_MORTAL:
1527 /* revive dead object temporarily for DelItem */
1528 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001529 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001530 Py_FatalError(
1531 "deletion of interned string failed");
1532 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001533
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 case SSTATE_INTERNED_IMMORTAL:
1535 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001536
Benjamin Peterson29060642009-01-31 22:14:21 +00001537 default:
1538 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001539 }
1540
Victor Stinner03490912011-10-03 23:45:12 +02001541 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001543 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001544 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001545 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1546 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001548 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549}
1550
#ifdef Py_DEBUG
/* Debug-build helper: return 1 if `unicode` is the shared empty string or
   the cached entry of unicode_latin1[] for its single character, and 0
   otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with canonical data can be a cache entry. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1567
Alexander Belopolsky40018472011-02-26 01:02:56 +00001568static int
Victor Stinner488fa492011-12-12 00:01:39 +01001569unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001570{
Victor Stinner488fa492011-12-12 00:01:39 +01001571 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 if (Py_REFCNT(unicode) != 1)
1573 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001574 if (_PyUnicode_HASH(unicode) != -1)
1575 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001576 if (PyUnicode_CHECK_INTERNED(unicode))
1577 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001578 if (!PyUnicode_CheckExact(unicode))
1579 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001580#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001581 /* singleton refcount is greater than 1 */
1582 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001583#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 return 1;
1585}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001586
Victor Stinnerfe226c02011-10-03 03:52:20 +02001587static int
1588unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1589{
1590 PyObject *unicode;
1591 Py_ssize_t old_length;
1592
1593 assert(p_unicode != NULL);
1594 unicode = *p_unicode;
1595
1596 assert(unicode != NULL);
1597 assert(PyUnicode_Check(unicode));
1598 assert(0 <= length);
1599
Victor Stinner910337b2011-10-03 03:20:16 +02001600 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 old_length = PyUnicode_WSTR_LENGTH(unicode);
1602 else
1603 old_length = PyUnicode_GET_LENGTH(unicode);
1604 if (old_length == length)
1605 return 0;
1606
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001607 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001608 _Py_INCREF_UNICODE_EMPTY();
1609 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001610 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001611 Py_DECREF(*p_unicode);
1612 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001613 return 0;
1614 }
1615
Victor Stinner488fa492011-12-12 00:01:39 +01001616 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001617 PyObject *copy = resize_copy(unicode, length);
1618 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001619 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 Py_DECREF(*p_unicode);
1621 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001622 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001623 }
1624
Victor Stinnerfe226c02011-10-03 03:52:20 +02001625 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001626 PyObject *new_unicode = resize_compact(unicode, length);
1627 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001628 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001629 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001630 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001631 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001632 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001633}
1634
Alexander Belopolsky40018472011-02-26 01:02:56 +00001635int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001636PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001637{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001638 PyObject *unicode;
1639 if (p_unicode == NULL) {
1640 PyErr_BadInternalCall();
1641 return -1;
1642 }
1643 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001644 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001645 {
1646 PyErr_BadInternalCall();
1647 return -1;
1648 }
1649 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001650}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001651
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001652static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001653unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1654 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655{
1656 PyObject *result;
1657 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001658 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1660 return 0;
1661 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1662 maxchar);
1663 if (result == NULL)
1664 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001665 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001666 Py_DECREF(*p_unicode);
1667 *p_unicode = result;
1668 return 0;
1669}
1670
1671static int
1672unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1673 Py_UCS4 ch)
1674{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001675 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001676 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001677 return -1;
1678 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1679 PyUnicode_DATA(*p_unicode),
1680 (*pos)++, ch);
1681 return 0;
1682}
1683
Victor Stinnerc5166102012-02-22 13:55:02 +01001684/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001685
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001686 WARNING: The function doesn't copy the terminating null character and
1687 doesn't check the maximum character (may write a latin1 character in an
1688 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001689static void
1690unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1691 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001692{
1693 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1694 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001695 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001696
1697 switch (kind) {
1698 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001699 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001700 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001701 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001702 }
1703 case PyUnicode_2BYTE_KIND: {
1704 Py_UCS2 *start = (Py_UCS2 *)data + index;
1705 Py_UCS2 *ucs2 = start;
1706 assert(index <= PyUnicode_GET_LENGTH(unicode));
1707
Victor Stinner184252a2012-06-16 02:57:41 +02001708 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001709 *ucs2 = (Py_UCS2)*str;
1710
1711 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001712 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001713 }
1714 default: {
1715 Py_UCS4 *start = (Py_UCS4 *)data + index;
1716 Py_UCS4 *ucs4 = start;
1717 assert(kind == PyUnicode_4BYTE_KIND);
1718 assert(index <= PyUnicode_GET_LENGTH(unicode));
1719
Victor Stinner184252a2012-06-16 02:57:41 +02001720 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001721 *ucs4 = (Py_UCS4)*str;
1722
1723 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001724 }
1725 }
1726}
1727
1728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729static PyObject*
1730get_latin1_char(unsigned char ch)
1731{
Victor Stinnera464fc12011-10-02 20:39:30 +02001732 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001734 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 if (!unicode)
1736 return NULL;
1737 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001738 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739 unicode_latin1[ch] = unicode;
1740 }
1741 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001742 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743}
1744
Alexander Belopolsky40018472011-02-26 01:02:56 +00001745PyObject *
1746PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001748 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 Py_UCS4 maxchar = 0;
1750 Py_ssize_t num_surrogates;
1751
1752 if (u == NULL)
1753 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001755 /* If the Unicode data is known at construction time, we can apply
1756 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001759 if (size == 0)
1760 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 /* Single character Unicode objects in the Latin-1 range are
1763 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001764 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 return get_latin1_char((unsigned char)*u);
1766
1767 /* If not empty and not single character, copy the Unicode data
1768 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001769 if (find_maxchar_surrogates(u, u + size,
1770 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 return NULL;
1772
Victor Stinner8faf8212011-12-08 22:14:11 +01001773 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 if (!unicode)
1775 return NULL;
1776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 switch (PyUnicode_KIND(unicode)) {
1778 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001779 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001780 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1781 break;
1782 case PyUnicode_2BYTE_KIND:
1783#if Py_UNICODE_SIZE == 2
1784 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1785#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001786 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1788#endif
1789 break;
1790 case PyUnicode_4BYTE_KIND:
1791#if SIZEOF_WCHAR_T == 2
1792 /* This is the only case which has to process surrogates, thus
1793 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001794 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795#else
1796 assert(num_surrogates == 0);
1797 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1798#endif
1799 break;
1800 default:
1801 assert(0 && "Impossible state");
1802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001804 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805}
1806
Alexander Belopolsky40018472011-02-26 01:02:56 +00001807PyObject *
1808PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001809{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001810 if (size < 0) {
1811 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001812 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001813 return NULL;
1814 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001815 if (u != NULL)
1816 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1817 else
1818 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001819}
1820
Alexander Belopolsky40018472011-02-26 01:02:56 +00001821PyObject *
1822PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001823{
1824 size_t size = strlen(u);
1825 if (size > PY_SSIZE_T_MAX) {
1826 PyErr_SetString(PyExc_OverflowError, "input too long");
1827 return NULL;
1828 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001829 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001830}
1831
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001832PyObject *
1833_PyUnicode_FromId(_Py_Identifier *id)
1834{
1835 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001836 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1837 strlen(id->string),
1838 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001839 if (!id->object)
1840 return NULL;
1841 PyUnicode_InternInPlace(&id->object);
1842 assert(!id->next);
1843 id->next = static_strings;
1844 static_strings = id;
1845 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001846 return id->object;
1847}
1848
1849void
1850_PyUnicode_ClearStaticStrings()
1851{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001852 _Py_Identifier *tmp, *s = static_strings;
1853 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001854 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001855 tmp = s->next;
1856 s->next = NULL;
1857 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001858 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001859 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001860}
1861
Benjamin Peterson0df54292012-03-26 14:50:32 -04001862/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001863
Victor Stinnerd3f08822012-05-29 12:57:52 +02001864PyObject*
1865_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001866{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001867 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001868 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001869 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001870#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001871 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001872#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001873 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001874 }
Victor Stinner785938e2011-12-11 20:09:03 +01001875 unicode = PyUnicode_New(size, 127);
1876 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001877 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001878 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1879 assert(_PyUnicode_CheckConsistency(unicode, 1));
1880 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001881}
1882
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001883static Py_UCS4
1884kind_maxchar_limit(unsigned int kind)
1885{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001886 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001887 case PyUnicode_1BYTE_KIND:
1888 return 0x80;
1889 case PyUnicode_2BYTE_KIND:
1890 return 0x100;
1891 case PyUnicode_4BYTE_KIND:
1892 return 0x10000;
1893 default:
1894 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001895 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001896 }
1897}
1898
Victor Stinnere6abb482012-05-02 01:15:40 +02001899Py_LOCAL_INLINE(Py_UCS4)
1900align_maxchar(Py_UCS4 maxchar)
1901{
1902 if (maxchar <= 127)
1903 return 127;
1904 else if (maxchar <= 255)
1905 return 255;
1906 else if (maxchar <= 65535)
1907 return 65535;
1908 else
1909 return MAX_UNICODE;
1910}
1911
Victor Stinner702c7342011-10-05 13:50:52 +02001912static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001913_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001914{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001916 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001917
Serhiy Storchaka678db842013-01-26 12:16:36 +02001918 if (size == 0)
1919 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001920 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001921 if (size == 1)
1922 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001923
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001924 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001925 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 if (!res)
1927 return NULL;
1928 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001929 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001931}
1932
Victor Stinnere57b1c02011-09-28 22:20:48 +02001933static PyObject*
1934_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935{
1936 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001937 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001938
Serhiy Storchaka678db842013-01-26 12:16:36 +02001939 if (size == 0)
1940 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001941 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001942 if (size == 1) {
1943 Py_UCS4 ch = u[0];
1944 if (ch < 256)
1945 return get_latin1_char((unsigned char)ch);
1946
1947 res = PyUnicode_New(1, ch);
1948 if (res == NULL)
1949 return NULL;
1950 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1951 assert(_PyUnicode_CheckConsistency(res, 1));
1952 return res;
1953 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001954
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001955 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001956 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 if (!res)
1958 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001959 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001960 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001961 else {
1962 _PyUnicode_CONVERT_BYTES(
1963 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1964 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001965 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966 return res;
1967}
1968
Victor Stinnere57b1c02011-09-28 22:20:48 +02001969static PyObject*
1970_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001971{
1972 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001973 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001974
Serhiy Storchaka678db842013-01-26 12:16:36 +02001975 if (size == 0)
1976 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001977 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001978 if (size == 1) {
1979 Py_UCS4 ch = u[0];
1980 if (ch < 256)
1981 return get_latin1_char((unsigned char)ch);
1982
1983 res = PyUnicode_New(1, ch);
1984 if (res == NULL)
1985 return NULL;
1986 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1987 assert(_PyUnicode_CheckConsistency(res, 1));
1988 return res;
1989 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001990
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001991 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001992 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 if (!res)
1994 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001995 if (max_char < 256)
1996 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1997 PyUnicode_1BYTE_DATA(res));
1998 else if (max_char < 0x10000)
1999 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2000 PyUnicode_2BYTE_DATA(res));
2001 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002003 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 return res;
2005}
2006
2007PyObject*
2008PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2009{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002010 if (size < 0) {
2011 PyErr_SetString(PyExc_ValueError, "size must be positive");
2012 return NULL;
2013 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002014 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002015 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002016 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002017 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002018 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002020 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002021 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002022 PyErr_SetString(PyExc_SystemError, "invalid kind");
2023 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025}
2026
Victor Stinnerece58de2012-04-23 23:36:38 +02002027Py_UCS4
2028_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2029{
2030 enum PyUnicode_Kind kind;
2031 void *startptr, *endptr;
2032
2033 assert(PyUnicode_IS_READY(unicode));
2034 assert(0 <= start);
2035 assert(end <= PyUnicode_GET_LENGTH(unicode));
2036 assert(start <= end);
2037
2038 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2039 return PyUnicode_MAX_CHAR_VALUE(unicode);
2040
2041 if (start == end)
2042 return 127;
2043
Victor Stinner94d558b2012-04-27 22:26:58 +02002044 if (PyUnicode_IS_ASCII(unicode))
2045 return 127;
2046
Victor Stinnerece58de2012-04-23 23:36:38 +02002047 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002048 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002049 endptr = (char *)startptr + end * kind;
2050 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002051 switch(kind) {
2052 case PyUnicode_1BYTE_KIND:
2053 return ucs1lib_find_max_char(startptr, endptr);
2054 case PyUnicode_2BYTE_KIND:
2055 return ucs2lib_find_max_char(startptr, endptr);
2056 case PyUnicode_4BYTE_KIND:
2057 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002058 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002059 assert(0);
2060 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002061 }
2062}
2063
Victor Stinner25a4b292011-10-06 12:31:55 +02002064/* Ensure that a string uses the most efficient storage, if it is not the
2065 case: create a new string with of the right kind. Write NULL into *p_unicode
2066 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002067static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002068unicode_adjust_maxchar(PyObject **p_unicode)
2069{
2070 PyObject *unicode, *copy;
2071 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002072 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002073 unsigned int kind;
2074
2075 assert(p_unicode != NULL);
2076 unicode = *p_unicode;
2077 assert(PyUnicode_IS_READY(unicode));
2078 if (PyUnicode_IS_ASCII(unicode))
2079 return;
2080
2081 len = PyUnicode_GET_LENGTH(unicode);
2082 kind = PyUnicode_KIND(unicode);
2083 if (kind == PyUnicode_1BYTE_KIND) {
2084 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002085 max_char = ucs1lib_find_max_char(u, u + len);
2086 if (max_char >= 128)
2087 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002088 }
2089 else if (kind == PyUnicode_2BYTE_KIND) {
2090 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002091 max_char = ucs2lib_find_max_char(u, u + len);
2092 if (max_char >= 256)
2093 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002094 }
2095 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002096 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002097 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002098 max_char = ucs4lib_find_max_char(u, u + len);
2099 if (max_char >= 0x10000)
2100 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002101 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002102 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002103 if (copy != NULL)
2104 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002105 Py_DECREF(unicode);
2106 *p_unicode = copy;
2107}
2108
Victor Stinner034f6cf2011-09-30 02:26:44 +02002109PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002110_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002111{
Victor Stinner87af4f22011-11-21 23:03:47 +01002112 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002113 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002114
Victor Stinner034f6cf2011-09-30 02:26:44 +02002115 if (!PyUnicode_Check(unicode)) {
2116 PyErr_BadInternalCall();
2117 return NULL;
2118 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002119 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002120 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002121
Victor Stinner87af4f22011-11-21 23:03:47 +01002122 length = PyUnicode_GET_LENGTH(unicode);
2123 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002124 if (!copy)
2125 return NULL;
2126 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2127
Victor Stinner87af4f22011-11-21 23:03:47 +01002128 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2129 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002130 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002131 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002132}
2133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134
Victor Stinnerbc603d12011-10-02 01:00:40 +02002135/* Widen Unicode objects to larger buffers. Don't write terminating null
2136 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137
2138void*
2139_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2140{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002141 Py_ssize_t len;
2142 void *result;
2143 unsigned int skind;
2144
Benjamin Petersonbac79492012-01-14 13:34:47 -05002145 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002146 return NULL;
2147
2148 len = PyUnicode_GET_LENGTH(s);
2149 skind = PyUnicode_KIND(s);
2150 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002151 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002152 return NULL;
2153 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002154 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002155 case PyUnicode_2BYTE_KIND:
2156 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2157 if (!result)
2158 return PyErr_NoMemory();
2159 assert(skind == PyUnicode_1BYTE_KIND);
2160 _PyUnicode_CONVERT_BYTES(
2161 Py_UCS1, Py_UCS2,
2162 PyUnicode_1BYTE_DATA(s),
2163 PyUnicode_1BYTE_DATA(s) + len,
2164 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002166 case PyUnicode_4BYTE_KIND:
2167 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2168 if (!result)
2169 return PyErr_NoMemory();
2170 if (skind == PyUnicode_2BYTE_KIND) {
2171 _PyUnicode_CONVERT_BYTES(
2172 Py_UCS2, Py_UCS4,
2173 PyUnicode_2BYTE_DATA(s),
2174 PyUnicode_2BYTE_DATA(s) + len,
2175 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002176 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002177 else {
2178 assert(skind == PyUnicode_1BYTE_KIND);
2179 _PyUnicode_CONVERT_BYTES(
2180 Py_UCS1, Py_UCS4,
2181 PyUnicode_1BYTE_DATA(s),
2182 PyUnicode_1BYTE_DATA(s) + len,
2183 result);
2184 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002186 default:
2187 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 }
Victor Stinner01698042011-10-04 00:04:26 +02002189 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 return NULL;
2191}
2192
2193static Py_UCS4*
2194as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2195 int copy_null)
2196{
2197 int kind;
2198 void *data;
2199 Py_ssize_t len, targetlen;
2200 if (PyUnicode_READY(string) == -1)
2201 return NULL;
2202 kind = PyUnicode_KIND(string);
2203 data = PyUnicode_DATA(string);
2204 len = PyUnicode_GET_LENGTH(string);
2205 targetlen = len;
2206 if (copy_null)
2207 targetlen++;
2208 if (!target) {
2209 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2210 PyErr_NoMemory();
2211 return NULL;
2212 }
2213 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2214 if (!target) {
2215 PyErr_NoMemory();
2216 return NULL;
2217 }
2218 }
2219 else {
2220 if (targetsize < targetlen) {
2221 PyErr_Format(PyExc_SystemError,
2222 "string is longer than the buffer");
2223 if (copy_null && 0 < targetsize)
2224 target[0] = 0;
2225 return NULL;
2226 }
2227 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002228 if (kind == PyUnicode_1BYTE_KIND) {
2229 Py_UCS1 *start = (Py_UCS1 *) data;
2230 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002232 else if (kind == PyUnicode_2BYTE_KIND) {
2233 Py_UCS2 *start = (Py_UCS2 *) data;
2234 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2235 }
2236 else {
2237 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002239 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 if (copy_null)
2241 target[len] = 0;
2242 return target;
2243}
2244
2245Py_UCS4*
2246PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2247 int copy_null)
2248{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002249 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002250 PyErr_BadInternalCall();
2251 return NULL;
2252 }
2253 return as_ucs4(string, target, targetsize, copy_null);
2254}
2255
2256Py_UCS4*
2257PyUnicode_AsUCS4Copy(PyObject *string)
2258{
2259 return as_ucs4(string, NULL, 0, 1);
2260}
2261
2262#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002263
Alexander Belopolsky40018472011-02-26 01:02:56 +00002264PyObject *
2265PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002266{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002269 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002270 PyErr_BadInternalCall();
2271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272 }
2273
Martin v. Löwis790465f2008-04-05 20:41:37 +00002274 if (size == -1) {
2275 size = wcslen(w);
2276 }
2277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279}
2280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002282
Walter Dörwald346737f2007-05-31 10:44:43 +00002283static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002284makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2285 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002286{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002287 *fmt++ = '%';
2288 if (width) {
2289 if (zeropad)
2290 *fmt++ = '0';
2291 fmt += sprintf(fmt, "%d", width);
2292 }
2293 if (precision)
2294 fmt += sprintf(fmt, ".%d", precision);
2295 if (longflag)
2296 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002297 else if (longlongflag) {
2298 /* longlongflag should only ever be nonzero on machines with
2299 HAVE_LONG_LONG defined */
2300#ifdef HAVE_LONG_LONG
2301 char *f = PY_FORMAT_LONG_LONG;
2302 while (*f)
2303 *fmt++ = *f++;
2304#else
2305 /* we shouldn't ever get here */
2306 assert(0);
2307 *fmt++ = 'l';
2308#endif
2309 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002310 else if (size_tflag) {
2311 char *f = PY_FORMAT_SIZE_T;
2312 while (*f)
2313 *fmt++ = *f++;
2314 }
2315 *fmt++ = c;
2316 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002317}
2318
/* helpers for PyUnicode_FromFormatV() */

/* True if `ch` is one of the integer conversions ('d', 'u', 'i') that may
   follow a length modifier. */
static int
format_flag_is_int_conv(char ch)
{
    return ch == 'd' || ch == 'u' || ch == 'i';
}

/* Parse the part of a format specification that follows the '%' which
   `f` points to: optional width, optional ".precision" and optional
   length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z").

   Each output pointer may be NULL; the non-NULL ones receive the parsed
   value.  The width reported is never smaller than the precision.  A
   length modifier is recognized (and its flag set) only when it is
   directly followed by 'd', 'u' or 'i'; otherwise it is left unconsumed
   and all three flags are 0.

   Returns a pointer to the character at which parsing stopped, normally
   the conversion character. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%' */
    f++;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (width < precision)
        width = precision;
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu, and the size_t flag. */
    switch (*f) {
    case 'l':
        if (format_flag_is_int_conv(f[1])) {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' && format_flag_is_int_conv(f[2])) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (format_flag_is_int_conv(f[1])) {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* Report only what the caller asked for. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2385
/* Maximum number of characters required for the output of %ld.  21
   characters allows for 64-bit integers (in decimal) and an optional sign.
   PyUnicode_FromFormatV() uses this as the minimum space reserved for each
   non-"long long" number it formats. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256), and
   1 + (N*53-1)/22 is ceil(N*53/22) in integer arithmetic; the remaining
   1 of the leading 2 is the sign. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2393
Walter Dörwaldd2034312007-05-18 16:29:38 +00002394PyObject *
2395PyUnicode_FromFormatV(const char *format, va_list vargs)
2396{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002397 va_list count;
2398 Py_ssize_t callcount = 0;
2399 PyObject **callresults = NULL;
2400 PyObject **callresult = NULL;
2401 Py_ssize_t n = 0;
2402 int width = 0;
2403 int precision = 0;
2404 int zeropad;
2405 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002406 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002407 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002408 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2410 Py_UCS4 argmaxchar;
2411 Py_ssize_t numbersize = 0;
2412 char *numberresults = NULL;
2413 char *numberresult = NULL;
2414 Py_ssize_t i;
2415 int kind;
2416 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002417
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002418 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002419 /* step 1: count the number of %S/%R/%A/%s format specifications
2420 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2421 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002423 * also estimate a upper bound for all the number formats in the string,
2424 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002425 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002426 for (f = format; *f; f++) {
2427 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002428 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2430 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2431 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2432 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002435#ifdef HAVE_LONG_LONG
2436 if (longlongflag) {
2437 if (width < MAX_LONG_LONG_CHARS)
2438 width = MAX_LONG_LONG_CHARS;
2439 }
2440 else
2441#endif
2442 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2443 including sign. Decimal takes the most space. This
2444 isn't enough for octal. If a width is specified we
2445 need more (which we allocate later). */
2446 if (width < MAX_LONG_CHARS)
2447 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448
2449 /* account for the size + '\0' to separate numbers
2450 inside of the numberresults buffer */
2451 numbersize += (width + 1);
2452 }
2453 }
2454 else if ((unsigned char)*f > 127) {
2455 PyErr_Format(PyExc_ValueError,
2456 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2457 "string, got a non-ASCII byte: 0x%02x",
2458 (unsigned char)*f);
2459 return NULL;
2460 }
2461 }
2462 /* step 2: allocate memory for the results of
2463 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2464 if (callcount) {
2465 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2466 if (!callresults) {
2467 PyErr_NoMemory();
2468 return NULL;
2469 }
2470 callresult = callresults;
2471 }
2472 /* step 2.5: allocate memory for the results of formating numbers */
2473 if (numbersize) {
2474 numberresults = PyObject_Malloc(numbersize);
2475 if (!numberresults) {
2476 PyErr_NoMemory();
2477 goto fail;
2478 }
2479 numberresult = numberresults;
2480 }
2481
2482 /* step 3: format numbers and figure out how large a buffer we need */
2483 for (f = format; *f; f++) {
2484 if (*f == '%') {
2485 const char* p;
2486 int longflag;
2487 int longlongflag;
2488 int size_tflag;
2489 int numprinted;
2490
2491 p = f;
2492 zeropad = (f[1] == '0');
2493 f = parse_format_flags(f, &width, &precision,
2494 &longflag, &longlongflag, &size_tflag);
2495 switch (*f) {
2496 case 'c':
2497 {
Serhiy Storchaka8eeae212013-06-23 20:12:14 +03002498 int ordinal = va_arg(count, int);
2499 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2500 PyErr_SetString(PyExc_OverflowError,
2501 "%c arg not in range(0x110000)");
2502 goto fail;
2503 }
2504 maxchar = Py_MAX(maxchar, (Py_UCS4)ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 n++;
2506 break;
2507 }
2508 case '%':
2509 n++;
2510 break;
2511 case 'i':
2512 case 'd':
2513 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2514 width, precision, *f);
2515 if (longflag)
2516 numprinted = sprintf(numberresult, fmt,
2517 va_arg(count, long));
2518#ifdef HAVE_LONG_LONG
2519 else if (longlongflag)
2520 numprinted = sprintf(numberresult, fmt,
2521 va_arg(count, PY_LONG_LONG));
2522#endif
2523 else if (size_tflag)
2524 numprinted = sprintf(numberresult, fmt,
2525 va_arg(count, Py_ssize_t));
2526 else
2527 numprinted = sprintf(numberresult, fmt,
2528 va_arg(count, int));
2529 n += numprinted;
2530 /* advance by +1 to skip over the '\0' */
2531 numberresult += (numprinted + 1);
2532 assert(*(numberresult - 1) == '\0');
2533 assert(*(numberresult - 2) != '\0');
2534 assert(numprinted >= 0);
2535 assert(numberresult <= numberresults + numbersize);
2536 break;
2537 case 'u':
2538 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2539 width, precision, 'u');
2540 if (longflag)
2541 numprinted = sprintf(numberresult, fmt,
2542 va_arg(count, unsigned long));
2543#ifdef HAVE_LONG_LONG
2544 else if (longlongflag)
2545 numprinted = sprintf(numberresult, fmt,
2546 va_arg(count, unsigned PY_LONG_LONG));
2547#endif
2548 else if (size_tflag)
2549 numprinted = sprintf(numberresult, fmt,
2550 va_arg(count, size_t));
2551 else
2552 numprinted = sprintf(numberresult, fmt,
2553 va_arg(count, unsigned int));
2554 n += numprinted;
2555 numberresult += (numprinted + 1);
2556 assert(*(numberresult - 1) == '\0');
2557 assert(*(numberresult - 2) != '\0');
2558 assert(numprinted >= 0);
2559 assert(numberresult <= numberresults + numbersize);
2560 break;
2561 case 'x':
2562 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2563 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2564 n += numprinted;
2565 numberresult += (numprinted + 1);
2566 assert(*(numberresult - 1) == '\0');
2567 assert(*(numberresult - 2) != '\0');
2568 assert(numprinted >= 0);
2569 assert(numberresult <= numberresults + numbersize);
2570 break;
2571 case 'p':
2572 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2573 /* %p is ill-defined: ensure leading 0x. */
2574 if (numberresult[1] == 'X')
2575 numberresult[1] = 'x';
2576 else if (numberresult[1] != 'x') {
2577 memmove(numberresult + 2, numberresult,
2578 strlen(numberresult) + 1);
2579 numberresult[0] = '0';
2580 numberresult[1] = 'x';
2581 numprinted += 2;
2582 }
2583 n += numprinted;
2584 numberresult += (numprinted + 1);
2585 assert(*(numberresult - 1) == '\0');
2586 assert(*(numberresult - 2) != '\0');
2587 assert(numprinted >= 0);
2588 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 break;
2590 case 's':
2591 {
2592 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002593 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002594 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002595 if (!str)
2596 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 /* since PyUnicode_DecodeUTF8 returns already flexible
2598 unicode objects, there is no need to call ready on them */
2599 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002600 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002602 /* Remember the str and switch to the next slot */
2603 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002604 break;
2605 }
2606 case 'U':
2607 {
2608 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002609 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 if (PyUnicode_READY(obj) == -1)
2611 goto fail;
2612 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002613 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002614 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 break;
2616 }
2617 case 'V':
2618 {
2619 PyObject *obj = va_arg(count, PyObject *);
2620 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002621 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002622 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002623 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002624 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 if (PyUnicode_READY(obj) == -1)
2626 goto fail;
2627 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002628 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002629 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002630 *callresult++ = NULL;
2631 }
2632 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002633 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002634 if (!str_obj)
2635 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002636 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002637 Py_DECREF(str_obj);
2638 goto fail;
2639 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002641 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002643 *callresult++ = str_obj;
2644 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002645 break;
2646 }
2647 case 'S':
2648 {
2649 PyObject *obj = va_arg(count, PyObject *);
2650 PyObject *str;
2651 assert(obj);
2652 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002653 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002654 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002655 if (PyUnicode_READY(str) == -1) {
2656 Py_DECREF(str);
2657 goto fail;
2658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002659 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002660 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002661 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002662 /* Remember the str and switch to the next slot */
2663 *callresult++ = str;
2664 break;
2665 }
2666 case 'R':
2667 {
2668 PyObject *obj = va_arg(count, PyObject *);
2669 PyObject *repr;
2670 assert(obj);
2671 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002672 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002674 if (PyUnicode_READY(repr) == -1) {
2675 Py_DECREF(repr);
2676 goto fail;
2677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002678 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002679 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002681 /* Remember the repr and switch to the next slot */
2682 *callresult++ = repr;
2683 break;
2684 }
2685 case 'A':
2686 {
2687 PyObject *obj = va_arg(count, PyObject *);
2688 PyObject *ascii;
2689 assert(obj);
2690 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002691 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002692 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002693 if (PyUnicode_READY(ascii) == -1) {
2694 Py_DECREF(ascii);
2695 goto fail;
2696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002697 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002698 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002699 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 /* Remember the repr and switch to the next slot */
2701 *callresult++ = ascii;
2702 break;
2703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002704 default:
2705 /* if we stumble upon an unknown
2706 formatting code, copy the rest of
2707 the format string to the output
2708 string. (we cannot just skip the
2709 code, since there's no way to know
2710 what's in the argument list) */
2711 n += strlen(p);
2712 goto expand;
2713 }
2714 } else
2715 n++;
2716 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002717 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002718 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002719 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002720 we don't have to resize the string.
2721 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002722 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002723 if (!string)
2724 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002725 kind = PyUnicode_KIND(string);
2726 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002727 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002728 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002730 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002731 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002732 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002733
2734 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2736 /* checking for == because the last argument could be a empty
2737 string, which causes i to point to end, the assert at the end of
2738 the loop */
2739 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002740
Benjamin Peterson14339b62009-01-31 16:36:08 +00002741 switch (*f) {
2742 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002743 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002744 const int ordinal = va_arg(vargs, int);
2745 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002746 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002747 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002748 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002749 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002750 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002751 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002752 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002753 {
Victor Stinner184252a2012-06-16 02:57:41 +02002754 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002755 /* unused, since we already have the result */
2756 if (*f == 'p')
2757 (void) va_arg(vargs, void *);
2758 else
2759 (void) va_arg(vargs, int);
2760 /* extract the result from numberresults and append. */
Victor Stinner184252a2012-06-16 02:57:41 +02002761 len = strlen(numberresult);
2762 unicode_write_cstr(string, i, numberresult, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002763 /* skip over the separating '\0' */
Victor Stinner184252a2012-06-16 02:57:41 +02002764 i += len;
2765 numberresult += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002766 assert(*numberresult == '\0');
2767 numberresult++;
2768 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002769 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002771 case 's':
2772 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002773 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002774 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002775 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002776 size = PyUnicode_GET_LENGTH(*callresult);
2777 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002778 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002780 /* We're done with the unicode()/repr() => forget it */
2781 Py_DECREF(*callresult);
2782 /* switch to next unicode()/repr() result */
2783 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 break;
2785 }
2786 case 'U':
2787 {
2788 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002789 Py_ssize_t size;
2790 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2791 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002792 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002794 break;
2795 }
2796 case 'V':
2797 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002798 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002799 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002800 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002801 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 size = PyUnicode_GET_LENGTH(obj);
2803 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002804 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002805 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002806 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 size = PyUnicode_GET_LENGTH(*callresult);
2808 assert(PyUnicode_KIND(*callresult) <=
2809 PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002810 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002812 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002813 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002814 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002815 break;
2816 }
2817 case 'S':
2818 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002819 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002820 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002821 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002822 /* unused, since we already have the result */
2823 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002824 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002825 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002826 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002827 /* We're done with the unicode()/repr() => forget it */
2828 Py_DECREF(*callresult);
2829 /* switch to next unicode()/repr() result */
2830 ++callresult;
2831 break;
2832 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002833 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002834 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002835 break;
2836 default:
Victor Stinner184252a2012-06-16 02:57:41 +02002837 {
2838 Py_ssize_t len = strlen(p);
2839 unicode_write_cstr(string, i, p, len);
2840 i += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002841 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002842 goto end;
2843 }
Victor Stinner184252a2012-06-16 02:57:41 +02002844 }
Victor Stinner1205f272010-09-11 00:54:47 +00002845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002846 else {
2847 assert(i < PyUnicode_GET_LENGTH(string));
2848 PyUnicode_WRITE(kind, data, i++, *f);
2849 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002851 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002852
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002854 if (callresults)
2855 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002856 if (numberresults)
2857 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002858 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002860 if (callresults) {
2861 PyObject **callresult2 = callresults;
2862 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002863 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002864 ++callresult2;
2865 }
2866 PyObject_Free(callresults);
2867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002868 if (numberresults)
2869 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002870 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002871}
2872
Walter Dörwaldd2034312007-05-18 16:29:38 +00002873PyObject *
2874PyUnicode_FromFormat(const char *format, ...)
2875{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002876 PyObject* ret;
2877 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002878
2879#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002880 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002881#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002882 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002883#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002884 ret = PyUnicode_FromFormatV(format, vargs);
2885 va_end(vargs);
2886 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002887}
2888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002889#ifdef HAVE_WCHAR_H
2890
Victor Stinner5593d8a2010-10-02 11:11:27 +00002891/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2892 convert a Unicode object to a wide character string.
2893
Victor Stinnerd88d9832011-09-06 02:00:05 +02002894 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002895 character) required to convert the unicode object. Ignore size argument.
2896
Victor Stinnerd88d9832011-09-06 02:00:05 +02002897 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002898 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002899 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002900static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002901unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002902 wchar_t *w,
2903 Py_ssize_t size)
2904{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002905 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002906 const wchar_t *wstr;
2907
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002908 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002909 if (wstr == NULL)
2910 return -1;
2911
Victor Stinner5593d8a2010-10-02 11:11:27 +00002912 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002913 if (size > res)
2914 size = res + 1;
2915 else
2916 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002917 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002918 return res;
2919 }
2920 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002921 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002922}
2923
2924Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002925PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002926 wchar_t *w,
2927 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928{
2929 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002930 PyErr_BadInternalCall();
2931 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002933 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934}
2935
Victor Stinner137c34c2010-09-29 10:25:54 +00002936wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002937PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002938 Py_ssize_t *size)
2939{
2940 wchar_t* buffer;
2941 Py_ssize_t buflen;
2942
2943 if (unicode == NULL) {
2944 PyErr_BadInternalCall();
2945 return NULL;
2946 }
2947
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002948 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002949 if (buflen == -1)
2950 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002951 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002952 PyErr_NoMemory();
2953 return NULL;
2954 }
2955
Victor Stinner137c34c2010-09-29 10:25:54 +00002956 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2957 if (buffer == NULL) {
2958 PyErr_NoMemory();
2959 return NULL;
2960 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002961 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002962 if (buflen == -1) {
2963 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002964 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002965 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002966 if (size != NULL)
2967 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002968 return buffer;
2969}
2970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002971#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972
Alexander Belopolsky40018472011-02-26 01:02:56 +00002973PyObject *
2974PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002976 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002977 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 PyErr_SetString(PyExc_ValueError,
2979 "chr() arg not in range(0x110000)");
2980 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002981 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002982
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002983 if ((Py_UCS4)ordinal < 256)
2984 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002986 v = PyUnicode_New(1, ordinal);
2987 if (v == NULL)
2988 return NULL;
2989 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002990 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002991 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002992}
2993
Alexander Belopolsky40018472011-02-26 01:02:56 +00002994PyObject *
2995PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002997 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002998 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002999 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003000 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003001 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003002 Py_INCREF(obj);
3003 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003004 }
3005 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003006 /* For a Unicode subtype that's not a Unicode object,
3007 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003008 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003009 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003010 PyErr_Format(PyExc_TypeError,
3011 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003012 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003013 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003014}
3015
Alexander Belopolsky40018472011-02-26 01:02:56 +00003016PyObject *
3017PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003018 const char *encoding,
3019 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003020{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003021 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003022 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003023
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003025 PyErr_BadInternalCall();
3026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003028
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003029 /* Decoding bytes objects is the most common case and should be fast */
3030 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003031 if (PyBytes_GET_SIZE(obj) == 0)
3032 _Py_RETURN_UNICODE_EMPTY();
3033 v = PyUnicode_Decode(
3034 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3035 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003036 return v;
3037 }
3038
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003039 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003040 PyErr_SetString(PyExc_TypeError,
3041 "decoding str is not supported");
3042 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003043 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003044
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003045 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3046 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3047 PyErr_Format(PyExc_TypeError,
3048 "coercing to str: need bytes, bytearray "
3049 "or buffer-like object, %.80s found",
3050 Py_TYPE(obj)->tp_name);
3051 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003052 }
Tim Petersced69f82003-09-16 20:30:58 +00003053
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003054 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003055 PyBuffer_Release(&buffer);
3056 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003058
Serhiy Storchaka05997252013-01-26 12:14:02 +02003059 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003060 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003061 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062}
3063
/* Normalize an encoding name so that it can be compared against the
   built-in codec shortcuts: upper-case characters (per Py_ISUPPER) are
   converted with Py_TOLOWER and '_' is replaced with '-', in order to
   catch e.g. UTF_8.  A NULL encoding stands for "utf-8".

   The normalized name and its terminating null character are written to
   lower, which must provide room for lower_len bytes.

   Return 1 on success.  Return 0 if the result (including the "utf-8"
   default used for a NULL encoding) is longer than lower_len-1 characters;
   in that case the contents of lower are unspecified. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Not even room for the terminator; also keeps lower_len - 1 below
       from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* The default must honour the buffer size like any other name. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }

    l = lower;
    l_end = &lower[lower_len - 1];      /* slot reserved for the '\0' */
    for (e = encoding; *e != '\0'; e++) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e))
            *l++ = Py_TOLOWER(*e);
        else if (*e == '_')
            *l++ = '-';
        else
            *l++ = *e;
    }
    *l = '\0';
    return 1;
}
3100
Alexander Belopolsky40018472011-02-26 01:02:56 +00003101PyObject *
3102PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003103 Py_ssize_t size,
3104 const char *encoding,
3105 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003106{
3107 PyObject *buffer = NULL, *unicode;
3108 Py_buffer info;
3109 char lower[11]; /* Enough for any encoding shortcut */
3110
Fred Drakee4315f52000-05-09 19:53:39 +00003111 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003112 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003113 if ((strcmp(lower, "utf-8") == 0) ||
3114 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003115 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003116 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003117 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003118 (strcmp(lower, "iso-8859-1") == 0))
3119 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003120#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003121 else if (strcmp(lower, "mbcs") == 0)
3122 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003123#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003124 else if (strcmp(lower, "ascii") == 0)
3125 return PyUnicode_DecodeASCII(s, size, errors);
3126 else if (strcmp(lower, "utf-16") == 0)
3127 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3128 else if (strcmp(lower, "utf-32") == 0)
3129 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131
3132 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003133 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003134 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003135 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003136 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137 if (buffer == NULL)
3138 goto onError;
Serhiy Storchaka94ee3892014-02-24 14:43:03 +02003139 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140 if (unicode == NULL)
3141 goto onError;
3142 if (!PyUnicode_Check(unicode)) {
3143 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003144 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003145 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146 Py_DECREF(unicode);
3147 goto onError;
3148 }
3149 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003150 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003151
Benjamin Peterson29060642009-01-31 22:14:21 +00003152 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153 Py_XDECREF(buffer);
3154 return NULL;
3155}
3156
Alexander Belopolsky40018472011-02-26 01:02:56 +00003157PyObject *
3158PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003159 const char *encoding,
3160 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003161{
3162 PyObject *v;
3163
3164 if (!PyUnicode_Check(unicode)) {
3165 PyErr_BadArgument();
3166 goto onError;
3167 }
3168
3169 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003170 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003171
3172 /* Decode via the codec registry */
3173 v = PyCodec_Decode(unicode, encoding, errors);
3174 if (v == NULL)
3175 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003176 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003177
Benjamin Peterson29060642009-01-31 22:14:21 +00003178 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003179 return NULL;
3180}
3181
Alexander Belopolsky40018472011-02-26 01:02:56 +00003182PyObject *
3183PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003184 const char *encoding,
3185 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003186{
3187 PyObject *v;
3188
3189 if (!PyUnicode_Check(unicode)) {
3190 PyErr_BadArgument();
3191 goto onError;
3192 }
3193
3194 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003195 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003196
3197 /* Decode via the codec registry */
3198 v = PyCodec_Decode(unicode, encoding, errors);
3199 if (v == NULL)
3200 goto onError;
3201 if (!PyUnicode_Check(v)) {
3202 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003203 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003204 Py_TYPE(v)->tp_name);
3205 Py_DECREF(v);
3206 goto onError;
3207 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003208 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003209
Benjamin Peterson29060642009-01-31 22:14:21 +00003210 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003211 return NULL;
3212}
3213
Alexander Belopolsky40018472011-02-26 01:02:56 +00003214PyObject *
3215PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003216 Py_ssize_t size,
3217 const char *encoding,
3218 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219{
3220 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003221
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222 unicode = PyUnicode_FromUnicode(s, size);
3223 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003224 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3226 Py_DECREF(unicode);
3227 return v;
3228}
3229
Alexander Belopolsky40018472011-02-26 01:02:56 +00003230PyObject *
3231PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003232 const char *encoding,
3233 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003234{
3235 PyObject *v;
3236
3237 if (!PyUnicode_Check(unicode)) {
3238 PyErr_BadArgument();
3239 goto onError;
3240 }
3241
3242 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003244
3245 /* Encode via the codec registry */
3246 v = PyCodec_Encode(unicode, encoding, errors);
3247 if (v == NULL)
3248 goto onError;
3249 return v;
3250
Benjamin Peterson29060642009-01-31 22:14:21 +00003251 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003252 return NULL;
3253}
3254
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  The index of the first
   piece that fails to convert is returned; 0 is returned when every piece
   converts, i.e. when no offending character could be found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or surrogate pair) followed by a null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    size_t pos = 0;

    while (wstr[pos] != L'\0') {
        size_t first = pos;     /* index reported if this piece fails */

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[pos])
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[pos + 1]))
        {
            unit[0] = wstr[pos];
            unit[1] = wstr[pos + 1];
            pos += 2;
        }
        else {
            unit[0] = wstr[pos];
            unit[1] = 0;
            pos++;
        }
#else
        unit[0] = wstr[pos];
        pos++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return first;
    }

    /* failed to find the unencodable character */
    return 0;
}
3301
Victor Stinner1b579672011-12-17 05:47:23 +01003302static int
3303locale_error_handler(const char *errors, int *surrogateescape)
3304{
3305 if (errors == NULL) {
3306 *surrogateescape = 0;
3307 return 0;
3308 }
3309
3310 if (strcmp(errors, "strict") == 0) {
3311 *surrogateescape = 0;
3312 return 0;
3313 }
3314 if (strcmp(errors, "surrogateescape") == 0) {
3315 *surrogateescape = 1;
3316 return 0;
3317 }
3318 PyErr_Format(PyExc_ValueError,
3319 "only 'strict' and 'surrogateescape' error handlers "
3320 "are supported, not '%s'",
3321 errors);
3322 return -1;
3323}
3324
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003325PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003326PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003327{
3328 Py_ssize_t wlen, wlen2;
3329 wchar_t *wstr;
3330 PyObject *bytes = NULL;
3331 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003332 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003333 PyObject *exc;
3334 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003335 int surrogateescape;
3336
3337 if (locale_error_handler(errors, &surrogateescape) < 0)
3338 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003339
3340 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3341 if (wstr == NULL)
3342 return NULL;
3343
3344 wlen2 = wcslen(wstr);
3345 if (wlen2 != wlen) {
3346 PyMem_Free(wstr);
3347 PyErr_SetString(PyExc_TypeError, "embedded null character");
3348 return NULL;
3349 }
3350
3351 if (surrogateescape) {
3352 /* locale encoding with surrogateescape */
3353 char *str;
3354
3355 str = _Py_wchar2char(wstr, &error_pos);
3356 if (str == NULL) {
3357 if (error_pos == (size_t)-1) {
3358 PyErr_NoMemory();
3359 PyMem_Free(wstr);
3360 return NULL;
3361 }
3362 else {
3363 goto encode_error;
3364 }
3365 }
3366 PyMem_Free(wstr);
3367
3368 bytes = PyBytes_FromString(str);
3369 PyMem_Free(str);
3370 }
3371 else {
3372 size_t len, len2;
3373
3374 len = wcstombs(NULL, wstr, 0);
3375 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003376 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003377 goto encode_error;
3378 }
3379
3380 bytes = PyBytes_FromStringAndSize(NULL, len);
3381 if (bytes == NULL) {
3382 PyMem_Free(wstr);
3383 return NULL;
3384 }
3385
3386 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3387 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003388 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003389 goto encode_error;
3390 }
3391 PyMem_Free(wstr);
3392 }
3393 return bytes;
3394
3395encode_error:
3396 errmsg = strerror(errno);
3397 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003398
3399 if (error_pos == (size_t)-1)
3400 error_pos = wcstombs_errorpos(wstr);
3401
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003402 PyMem_Free(wstr);
3403 Py_XDECREF(bytes);
3404
Victor Stinner2f197072011-12-17 07:08:30 +01003405 if (errmsg != NULL) {
3406 size_t errlen;
3407 wstr = _Py_char2wchar(errmsg, &errlen);
3408 if (wstr != NULL) {
3409 reason = PyUnicode_FromWideChar(wstr, errlen);
3410 PyMem_Free(wstr);
3411 } else
3412 errmsg = NULL;
3413 }
3414 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003415 reason = PyUnicode_FromString(
3416 "wcstombs() encountered an unencodable "
3417 "wide character");
3418 if (reason == NULL)
3419 return NULL;
3420
3421 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3422 "locale", unicode,
3423 (Py_ssize_t)error_pos,
3424 (Py_ssize_t)(error_pos+1),
3425 reason);
3426 Py_DECREF(reason);
3427 if (exc != NULL) {
3428 PyCodec_StrictErrors(exc);
3429 Py_XDECREF(exc);
3430 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003431 return NULL;
3432}
3433
Victor Stinnerad158722010-10-27 00:25:46 +00003434PyObject *
3435PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003436{
Victor Stinner99b95382011-07-04 14:23:54 +02003437#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003438 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003439#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003440 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003441#else
Victor Stinner793b5312011-04-27 00:24:21 +02003442 PyInterpreterState *interp = PyThreadState_GET()->interp;
3443 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3444 cannot use it to encode and decode filenames before it is loaded. Load
3445 the Python codec requires to encode at least its own filename. Use the C
3446 version of the locale codec until the codec registry is initialized and
3447 the Python codec is loaded.
3448
3449 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3450 cannot only rely on it: check also interp->fscodec_initialized for
3451 subinterpreters. */
3452 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003453 return PyUnicode_AsEncodedString(unicode,
3454 Py_FileSystemDefaultEncoding,
3455 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003456 }
3457 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003458 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003459 }
Victor Stinnerad158722010-10-27 00:25:46 +00003460#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003461}
3462
Alexander Belopolsky40018472011-02-26 01:02:56 +00003463PyObject *
3464PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003465 const char *encoding,
3466 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467{
3468 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003469 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003470
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471 if (!PyUnicode_Check(unicode)) {
3472 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003473 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474 }
Fred Drakee4315f52000-05-09 19:53:39 +00003475
Fred Drakee4315f52000-05-09 19:53:39 +00003476 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003477 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003478 if ((strcmp(lower, "utf-8") == 0) ||
3479 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003480 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003481 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003482 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003483 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003484 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003485 }
Victor Stinner37296e82010-06-10 13:36:23 +00003486 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003487 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003488 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003489 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003490#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003491 else if (strcmp(lower, "mbcs") == 0)
3492 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003493#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003494 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003495 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497
3498 /* Encode via the codec registry */
Serhiy Storchaka94ee3892014-02-24 14:43:03 +02003499 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003501 return NULL;
3502
3503 /* The normal path */
3504 if (PyBytes_Check(v))
3505 return v;
3506
3507 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003508 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003509 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003510 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003511
3512 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3513 "encoder %s returned bytearray instead of bytes",
3514 encoding);
3515 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003516 Py_DECREF(v);
3517 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003518 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003519
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003520 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3521 Py_DECREF(v);
3522 return b;
3523 }
3524
3525 PyErr_Format(PyExc_TypeError,
3526 "encoder did not return a bytes object (type=%.400s)",
3527 Py_TYPE(v)->tp_name);
3528 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003529 return NULL;
3530}
3531
Alexander Belopolsky40018472011-02-26 01:02:56 +00003532PyObject *
3533PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003534 const char *encoding,
3535 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003536{
3537 PyObject *v;
3538
3539 if (!PyUnicode_Check(unicode)) {
3540 PyErr_BadArgument();
3541 goto onError;
3542 }
3543
3544 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003545 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003546
3547 /* Encode via the codec registry */
3548 v = PyCodec_Encode(unicode, encoding, errors);
3549 if (v == NULL)
3550 goto onError;
3551 if (!PyUnicode_Check(v)) {
3552 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003553 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003554 Py_TYPE(v)->tp_name);
3555 Py_DECREF(v);
3556 goto onError;
3557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003559
Benjamin Peterson29060642009-01-31 22:14:21 +00003560 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561 return NULL;
3562}
3563
/* Return the byte offset in str (of at most len bytes) at which mbrtowc()
   first reports an invalid or incomplete multibyte sequence.

   Returns 0 when no such sequence is found, when a null wide character is
   reached first, or when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining > 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3594
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003595PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003596PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003597 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003598{
3599 wchar_t smallbuf[256];
3600 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3601 wchar_t *wstr;
3602 size_t wlen, wlen2;
3603 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003604 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003605 size_t error_pos;
3606 char *errmsg;
3607 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003608
3609 if (locale_error_handler(errors, &surrogateescape) < 0)
3610 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003611
3612 if (str[len] != '\0' || len != strlen(str)) {
3613 PyErr_SetString(PyExc_TypeError, "embedded null character");
3614 return NULL;
3615 }
3616
3617 if (surrogateescape)
3618 {
3619 wstr = _Py_char2wchar(str, &wlen);
3620 if (wstr == NULL) {
3621 if (wlen == (size_t)-1)
3622 PyErr_NoMemory();
3623 else
3624 PyErr_SetFromErrno(PyExc_OSError);
3625 return NULL;
3626 }
3627
3628 unicode = PyUnicode_FromWideChar(wstr, wlen);
3629 PyMem_Free(wstr);
3630 }
3631 else {
3632#ifndef HAVE_BROKEN_MBSTOWCS
3633 wlen = mbstowcs(NULL, str, 0);
3634#else
3635 wlen = len;
3636#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003637 if (wlen == (size_t)-1)
3638 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003639 if (wlen+1 <= smallbuf_len) {
3640 wstr = smallbuf;
3641 }
3642 else {
3643 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3644 return PyErr_NoMemory();
3645
3646 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3647 if (!wstr)
3648 return PyErr_NoMemory();
3649 }
3650
3651 /* This shouldn't fail now */
3652 wlen2 = mbstowcs(wstr, str, wlen+1);
3653 if (wlen2 == (size_t)-1) {
3654 if (wstr != smallbuf)
3655 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003656 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003657 }
3658#ifdef HAVE_BROKEN_MBSTOWCS
3659 assert(wlen2 == wlen);
3660#endif
3661 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3662 if (wstr != smallbuf)
3663 PyMem_Free(wstr);
3664 }
3665 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003666
3667decode_error:
3668 errmsg = strerror(errno);
3669 assert(errmsg != NULL);
3670
3671 error_pos = mbstowcs_errorpos(str, len);
3672 if (errmsg != NULL) {
3673 size_t errlen;
3674 wstr = _Py_char2wchar(errmsg, &errlen);
3675 if (wstr != NULL) {
3676 reason = PyUnicode_FromWideChar(wstr, errlen);
3677 PyMem_Free(wstr);
3678 } else
3679 errmsg = NULL;
3680 }
3681 if (errmsg == NULL)
3682 reason = PyUnicode_FromString(
3683 "mbstowcs() encountered an invalid multibyte sequence");
3684 if (reason == NULL)
3685 return NULL;
3686
3687 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3688 "locale", str, len,
3689 (Py_ssize_t)error_pos,
3690 (Py_ssize_t)(error_pos+1),
3691 reason);
3692 Py_DECREF(reason);
3693 if (exc != NULL) {
3694 PyCodec_StrictErrors(exc);
3695 Py_XDECREF(exc);
3696 }
3697 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003698}
3699
3700PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003701PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003702{
3703 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003704 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003705}
3706
3707
3708PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003709PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003710 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003711 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3712}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003713
Christian Heimes5894ba72007-11-04 11:43:14 +00003714PyObject*
3715PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3716{
Victor Stinner99b95382011-07-04 14:23:54 +02003717#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003718 return PyUnicode_DecodeMBCS(s, size, NULL);
3719#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003720 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003721#else
Victor Stinner793b5312011-04-27 00:24:21 +02003722 PyInterpreterState *interp = PyThreadState_GET()->interp;
3723 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3724 cannot use it to encode and decode filenames before it is loaded. Load
3725 the Python codec requires to encode at least its own filename. Use the C
3726 version of the locale codec until the codec registry is initialized and
3727 the Python codec is loaded.
3728
3729 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3730 cannot only rely on it: check also interp->fscodec_initialized for
3731 subinterpreters. */
3732 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003733 return PyUnicode_Decode(s, size,
3734 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003735 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003736 }
3737 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003738 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003739 }
Victor Stinnerad158722010-10-27 00:25:46 +00003740#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003741}
3742
Martin v. Löwis011e8422009-05-05 04:43:17 +00003743
3744int
Antoine Pitrou13348842012-01-29 18:36:34 +01003745_PyUnicode_HasNULChars(PyObject* s)
3746{
3747 static PyObject *nul = NULL;
3748
3749 if (nul == NULL)
3750 nul = PyUnicode_FromStringAndSize("\0", 1);
3751 if (nul == NULL)
3752 return -1;
3753 return PyUnicode_Contains(s, nul);
3754}
3755
3756
3757int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003758PyUnicode_FSConverter(PyObject* arg, void* addr)
3759{
3760 PyObject *output = NULL;
3761 Py_ssize_t size;
3762 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003763 if (arg == NULL) {
3764 Py_DECREF(*(PyObject**)addr);
3765 return 1;
3766 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003767 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003768 output = arg;
3769 Py_INCREF(output);
3770 }
3771 else {
3772 arg = PyUnicode_FromObject(arg);
3773 if (!arg)
3774 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003775 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003776 Py_DECREF(arg);
3777 if (!output)
3778 return 0;
3779 if (!PyBytes_Check(output)) {
3780 Py_DECREF(output);
3781 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3782 return 0;
3783 }
3784 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003785 size = PyBytes_GET_SIZE(output);
3786 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003787 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003788 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003789 Py_DECREF(output);
3790 return 0;
3791 }
3792 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003793 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003794}
3795
3796
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003797int
3798PyUnicode_FSDecoder(PyObject* arg, void* addr)
3799{
3800 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003801 if (arg == NULL) {
3802 Py_DECREF(*(PyObject**)addr);
3803 return 1;
3804 }
3805 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003806 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003807 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003808 output = arg;
3809 Py_INCREF(output);
3810 }
3811 else {
3812 arg = PyBytes_FromObject(arg);
3813 if (!arg)
3814 return 0;
3815 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3816 PyBytes_GET_SIZE(arg));
3817 Py_DECREF(arg);
3818 if (!output)
3819 return 0;
3820 if (!PyUnicode_Check(output)) {
3821 Py_DECREF(output);
3822 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3823 return 0;
3824 }
3825 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003826 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003827 Py_DECREF(output);
3828 return 0;
3829 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003830 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003831 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003832 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3833 Py_DECREF(output);
3834 return 0;
3835 }
3836 *(PyObject**)addr = output;
3837 return Py_CLEANUP_SUPPORTED;
3838}
3839
3840
Martin v. Löwis5b222132007-06-10 09:51:05 +00003841char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003842PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003843{
Christian Heimesf3863112007-11-22 07:46:41 +00003844 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003846 if (!PyUnicode_Check(unicode)) {
3847 PyErr_BadArgument();
3848 return NULL;
3849 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003850 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003851 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003852
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003853 if (PyUnicode_UTF8(unicode) == NULL) {
3854 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003855 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3856 if (bytes == NULL)
3857 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003858 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3859 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860 Py_DECREF(bytes);
3861 return NULL;
3862 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003863 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3864 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3865 PyBytes_AS_STRING(bytes),
3866 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867 Py_DECREF(bytes);
3868 }
3869
3870 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003871 *psize = PyUnicode_UTF8_LENGTH(unicode);
3872 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003873}
3874
3875char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003878 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3879}
3880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881Py_UNICODE *
3882PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3883{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884 const unsigned char *one_byte;
3885#if SIZEOF_WCHAR_T == 4
3886 const Py_UCS2 *two_bytes;
3887#else
3888 const Py_UCS4 *four_bytes;
3889 const Py_UCS4 *ucs4_end;
3890 Py_ssize_t num_surrogates;
3891#endif
3892 wchar_t *w;
3893 wchar_t *wchar_end;
3894
3895 if (!PyUnicode_Check(unicode)) {
3896 PyErr_BadArgument();
3897 return NULL;
3898 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003899 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003900 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003901 assert(_PyUnicode_KIND(unicode) != 0);
3902 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003904 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003905#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003906 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3907 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908 num_surrogates = 0;
3909
3910 for (; four_bytes < ucs4_end; ++four_bytes) {
3911 if (*four_bytes > 0xFFFF)
3912 ++num_surrogates;
3913 }
3914
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003915 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3916 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3917 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918 PyErr_NoMemory();
3919 return NULL;
3920 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003921 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003923 w = _PyUnicode_WSTR(unicode);
3924 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3925 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3927 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003928 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003929 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003930 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3931 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003932 }
3933 else
3934 *w = *four_bytes;
3935
3936 if (w > wchar_end) {
3937 assert(0 && "Miscalculated string end");
3938 }
3939 }
3940 *w = 0;
3941#else
3942 /* sizeof(wchar_t) == 4 */
3943 Py_FatalError("Impossible unicode object state, wstr and str "
3944 "should share memory already.");
3945 return NULL;
3946#endif
3947 }
3948 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003949 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3950 (_PyUnicode_LENGTH(unicode) + 1));
3951 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 PyErr_NoMemory();
3953 return NULL;
3954 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003955 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3956 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3957 w = _PyUnicode_WSTR(unicode);
3958 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003959
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003960 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3961 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962 for (; w < wchar_end; ++one_byte, ++w)
3963 *w = *one_byte;
3964 /* null-terminate the wstr */
3965 *w = 0;
3966 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003967 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003969 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003970 for (; w < wchar_end; ++two_bytes, ++w)
3971 *w = *two_bytes;
3972 /* null-terminate the wstr */
3973 *w = 0;
3974#else
3975 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003976 PyObject_FREE(_PyUnicode_WSTR(unicode));
3977 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978 Py_FatalError("Impossible unicode object state, wstr "
3979 "and str should share memory already.");
3980 return NULL;
3981#endif
3982 }
3983 else {
3984 assert(0 && "This should never happen.");
3985 }
3986 }
3987 }
3988 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003989 *size = PyUnicode_WSTR_LENGTH(unicode);
3990 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003991}
3992
Alexander Belopolsky40018472011-02-26 01:02:56 +00003993Py_UNICODE *
3994PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997}
3998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999
Alexander Belopolsky40018472011-02-26 01:02:56 +00004000Py_ssize_t
4001PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002{
4003 if (!PyUnicode_Check(unicode)) {
4004 PyErr_BadArgument();
4005 goto onError;
4006 }
4007 return PyUnicode_GET_SIZE(unicode);
4008
Benjamin Peterson29060642009-01-31 22:14:21 +00004009 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 return -1;
4011}
4012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013Py_ssize_t
4014PyUnicode_GetLength(PyObject *unicode)
4015{
Victor Stinner07621332012-06-16 04:53:46 +02004016 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017 PyErr_BadArgument();
4018 return -1;
4019 }
Victor Stinner07621332012-06-16 04:53:46 +02004020 if (PyUnicode_READY(unicode) == -1)
4021 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004022 return PyUnicode_GET_LENGTH(unicode);
4023}
4024
4025Py_UCS4
4026PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4027{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004028 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4029 PyErr_BadArgument();
4030 return (Py_UCS4)-1;
4031 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004032 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004033 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034 return (Py_UCS4)-1;
4035 }
4036 return PyUnicode_READ_CHAR(unicode, index);
4037}
4038
4039int
4040PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4041{
4042 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004043 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004044 return -1;
4045 }
Victor Stinner488fa492011-12-12 00:01:39 +01004046 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004047 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004048 PyErr_SetString(PyExc_IndexError, "string index out of range");
4049 return -1;
4050 }
Victor Stinner488fa492011-12-12 00:01:39 +01004051 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004052 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004053 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4054 PyErr_SetString(PyExc_ValueError, "character out of range");
4055 return -1;
4056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4058 index, ch);
4059 return 0;
4060}
4061
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4067
Victor Stinner554f3f02010-06-16 23:33:54 +00004068/* create or adjust a UnicodeDecodeError */
4069static void
4070make_decode_exception(PyObject **exceptionObject,
4071 const char *encoding,
4072 const char *input, Py_ssize_t length,
4073 Py_ssize_t startpos, Py_ssize_t endpos,
4074 const char *reason)
4075{
4076 if (*exceptionObject == NULL) {
4077 *exceptionObject = PyUnicodeDecodeError_Create(
4078 encoding, input, length, startpos, endpos, reason);
4079 }
4080 else {
4081 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4082 goto onError;
4083 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4084 goto onError;
4085 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4086 goto onError;
4087 }
4088 return;
4089
4090onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004091 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004092}
4093
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004094/* error handling callback helper:
4095 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004096 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004097 and adjust various state variables.
4098 return 0 on success, -1 on error
4099*/
4100
Alexander Belopolsky40018472011-02-26 01:02:56 +00004101static int
4102unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004103 const char *encoding, const char *reason,
4104 const char **input, const char **inend, Py_ssize_t *startinpos,
4105 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004106 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004108 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109
4110 PyObject *restuple = NULL;
4111 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004112 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004113 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004114 Py_ssize_t requiredsize;
4115 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004116 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 int res = -1;
4118
Victor Stinner596a6c42011-11-09 00:02:18 +01004119 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4120 outsize = PyUnicode_GET_LENGTH(*output);
4121 else
4122 outsize = _PyUnicode_WSTR_LENGTH(*output);
4123
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004125 *errorHandler = PyCodec_LookupError(errors);
4126 if (*errorHandler == NULL)
4127 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 }
4129
Victor Stinner554f3f02010-06-16 23:33:54 +00004130 make_decode_exception(exceptionObject,
4131 encoding,
4132 *input, *inend - *input,
4133 *startinpos, *endinpos,
4134 reason);
4135 if (*exceptionObject == NULL)
4136 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137
4138 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4139 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004142 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004143 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004144 }
4145 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004147 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004148 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004149
4150 /* Copy back the bytes variables, which might have been modified by the
4151 callback */
4152 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4153 if (!inputobj)
4154 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004155 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004156 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004157 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004158 *input = PyBytes_AS_STRING(inputobj);
4159 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004160 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004161 /* we can DECREF safely, as the exception has another reference,
4162 so the object won't go away. */
4163 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004164
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004167 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004168 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4169 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004170 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171
Victor Stinner596a6c42011-11-09 00:02:18 +01004172 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4173 /* need more space? (at least enough for what we
4174 have+the replacement+the rest of the string (starting
4175 at the new input position), so we won't have to check space
4176 when there are no errors in the rest of the string) */
4177 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004178 requiredsize = *outpos;
4179 if (requiredsize > PY_SSIZE_T_MAX - replen)
4180 goto overflow;
4181 requiredsize += replen;
4182 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4183 goto overflow;
4184 requiredsize += insize - newpos;
Victor Stinner596a6c42011-11-09 00:02:18 +01004185 if (requiredsize > outsize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004186 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinner596a6c42011-11-09 00:02:18 +01004187 requiredsize = 2*outsize;
4188 if (unicode_resize(output, requiredsize) < 0)
4189 goto onError;
4190 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004191 if (unicode_widen(output, *outpos,
4192 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004193 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004194 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004195 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004197 else {
4198 wchar_t *repwstr;
4199 Py_ssize_t repwlen;
4200 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4201 if (repwstr == NULL)
4202 goto onError;
4203 /* need more space? (at least enough for what we
4204 have+the replacement+the rest of the string (starting
4205 at the new input position), so we won't have to check space
4206 when there are no errors in the rest of the string) */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004207 requiredsize = *outpos;
4208 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4209 goto overflow;
4210 requiredsize += repwlen;
4211 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4212 goto overflow;
4213 requiredsize += insize - newpos;
Victor Stinner596a6c42011-11-09 00:02:18 +01004214 if (requiredsize > outsize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004215 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinner596a6c42011-11-09 00:02:18 +01004216 requiredsize = 2*outsize;
4217 if (unicode_resize(output, requiredsize) < 0)
4218 goto onError;
4219 }
4220 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4221 *outpos += repwlen;
4222 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004224 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226 /* we made it! */
4227 res = 0;
4228
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230 Py_XDECREF(restuple);
4231 return res;
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004232
4233 overflow:
4234 PyErr_SetString(PyExc_OverflowError,
4235 "decoded result is too long for a Python string");
4236 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004237}
4238
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004239/* --- UTF-7 Codec -------------------------------------------------------- */
4240
Antoine Pitrou244651a2009-05-04 18:56:13 +00004241/* See RFC2152 for details. We encode conservatively and decode liberally. */
4242
4243/* Three simple macros defining base-64. */
4244
/* IS_BASE64(c): non-zero when c is one of the 64 characters of the
   base-64 alphabet (A-Z, a-z, 0-9, '+', '/').  The argument is
   evaluated more than once, so pass a side-effect-free expression. */

#define IS_BASE64(c)                                    \
    ((c) == '+' || (c) == '/' ||                        \
     ('0' <= (c) && (c) <= '9') ||                      \
     ('A' <= (c) && (c) <= 'Z') ||                      \
     ('a' <= (c) && (c) <= 'z'))
4252
/* FROM_BASE64(c): the 6-bit value (0..63) of base-64 character c.
   Only meaningful when IS_BASE64(c) holds; any character that is not a
   letter, digit or '+' maps to 63.  The argument is evaluated more
   than once. */

#define FROM_BASE64(c)                                  \
    (('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) == '+') ? 62 : 63)
4260
/* TO_BASE64(n): the base-64 character that represents the bottom
   6 bits of n; higher bits of n are ignored. */

#define TO_BASE64(n)                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                       \
     "abcdefghijklmnopqrstuvwxyz"                       \
     "0123456789+/"[(n) & 0x3f])
4265
/* DECODE_DIRECT(c): true when byte c, met in UTF-7 input outside a
 * shift sequence, decodes to itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4273
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code
 * (0..127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4307
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c         character to classify; only 1..127 can ever be direct
 *   directO   non-zero: Set O characters (category 1) are direct
 *   directWS  non-zero: whitespace (category 2) is direct
 *
 * Set D characters (category 0) are always direct.  utf7_category is
 * only indexed after c has been range-checked.  The flag arguments are
 * parenthesized so that compound expressions (e.g. `a || b`) keep
 * their meaning inside the expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004319
Alexander Belopolsky40018472011-02-26 01:02:56 +00004320PyObject *
4321PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004322 Py_ssize_t size,
4323 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004324{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004325 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4326}
4327
Antoine Pitrou244651a2009-05-04 18:56:13 +00004328/* The decoder. The only state we preserve is our read position,
4329 * i.e. how many characters we have consumed. So if we end in the
4330 * middle of a shift sequence we have to back off the read position
4331 * and the output to the beginning of the sequence, otherwise we lose
4332 * all the shift state (seen bits, number of bits seen, high
4333 * surrogate). */
4334
Alexander Belopolsky40018472011-02-26 01:02:56 +00004335PyObject *
4336PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004337 Py_ssize_t size,
4338 const char *errors,
4339 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004340{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004341 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004342 Py_ssize_t startinpos;
4343 Py_ssize_t endinpos;
4344 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004346 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004347 const char *errmsg = "";
4348 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004349 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350 unsigned int base64bits = 0;
4351 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004352 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353 PyObject *errorHandler = NULL;
4354 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004355
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004356 /* Start off assuming it's all ASCII. Widen later as necessary. */
4357 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004358 if (!unicode)
4359 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004360 if (size == 0) {
4361 if (consumed)
4362 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004363 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004364 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004365
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004366 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004367 e = s + size;
4368
4369 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004370 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004371 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004372 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004373
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 if (inShift) { /* in a base-64 section */
4375 if (IS_BASE64(ch)) { /* consume a base-64 character */
4376 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4377 base64bits += 6;
4378 s++;
4379 if (base64bits >= 16) {
4380 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004381 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004382 base64bits -= 16;
4383 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004384 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 if (surrogate) {
4386 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004387 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4388 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004389 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4390 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004392 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004393 }
4394 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004395 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4396 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004397 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004398 }
4399 }
Victor Stinner551ac952011-11-29 22:58:13 +01004400 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004401 /* first surrogate */
4402 surrogate = outCh;
4403 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004404 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004405 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4406 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004407 }
4408 }
4409 }
4410 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004411 inShift = 0;
4412 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004414 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4415 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004416 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 if (base64bits > 0) { /* left-over bits */
4419 if (base64bits >= 6) {
4420 /* We've seen at least one base-64 character */
4421 errmsg = "partial character in shift sequence";
4422 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 else {
4425 /* Some bits remain; they should be zero */
4426 if (base64buffer != 0) {
4427 errmsg = "non-zero padding bits in shift sequence";
4428 goto utf7Error;
4429 }
4430 }
4431 }
4432 if (ch != '-') {
4433 /* '-' is absorbed; other terminating
4434 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004435 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4436 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438 }
4439 }
4440 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004442 s++; /* consume '+' */
4443 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004445 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4446 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 }
4448 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004449 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004450 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004452 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004453 }
4454 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004456 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4457 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004458 s++;
4459 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 else {
4461 startinpos = s-starts;
4462 s++;
4463 errmsg = "unexpected special character";
4464 goto utf7Error;
4465 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468 endinpos = s-starts;
4469 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004470 errors, &errorHandler,
4471 "utf7", errmsg,
4472 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004473 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004475 }
4476
Antoine Pitrou244651a2009-05-04 18:56:13 +00004477 /* end of string */
4478
4479 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4480 /* if we're in an inconsistent state, that's an error */
4481 if (surrogate ||
4482 (base64bits >= 6) ||
4483 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004484 endinpos = size;
4485 if (unicode_decode_call_errorhandler(
4486 errors, &errorHandler,
4487 "utf7", "unterminated shift sequence",
4488 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004489 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004490 goto onError;
4491 if (s < e)
4492 goto restart;
4493 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004494 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004495
4496 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004497 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004498 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004499 *consumed = startinpos;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004500 if (outpos != shiftOutStart &&
4501 PyUnicode_MAX_CHAR_VALUE(unicode) > 127) {
4502 PyObject *result = PyUnicode_FromKindAndData(
4503 PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4504 shiftOutStart);
4505 Py_DECREF(unicode);
4506 unicode = result;
4507 }
4508 outpos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004509 }
4510 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004511 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004512 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004513 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004514
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004515 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 goto onError;
4517
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518 Py_XDECREF(errorHandler);
4519 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004520 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523 Py_XDECREF(errorHandler);
4524 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004525 Py_DECREF(unicode);
4526 return NULL;
4527}
4528
4529
Alexander Belopolsky40018472011-02-26 01:02:56 +00004530PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004531_PyUnicode_EncodeUTF7(PyObject *str,
4532 int base64SetO,
4533 int base64WhiteSpace,
4534 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004536 int kind;
4537 void *data;
4538 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004539 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004540 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004541 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 unsigned int base64bits = 0;
4543 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 char * out;
4545 char * start;
4546
Benjamin Petersonbac79492012-01-14 13:34:47 -05004547 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004548 return NULL;
4549 kind = PyUnicode_KIND(str);
4550 data = PyUnicode_DATA(str);
4551 len = PyUnicode_GET_LENGTH(str);
4552
4553 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004554 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004556 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004557 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004558 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004559 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004560 if (v == NULL)
4561 return NULL;
4562
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004563 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004564 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004565 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004566
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 if (inShift) {
4568 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4569 /* shifting out */
4570 if (base64bits) { /* output remaining bits */
4571 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4572 base64buffer = 0;
4573 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004574 }
4575 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 /* Characters not in the BASE64 set implicitly unshift the sequence
4577 so no '-' is required, except if the character is itself a '-' */
4578 if (IS_BASE64(ch) || ch == '-') {
4579 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004580 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 *out++ = (char) ch;
4582 }
4583 else {
4584 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004585 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004586 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004587 else { /* not in a shift sequence */
4588 if (ch == '+') {
4589 *out++ = '+';
4590 *out++ = '-';
4591 }
4592 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4593 *out++ = (char) ch;
4594 }
4595 else {
4596 *out++ = '+';
4597 inShift = 1;
4598 goto encode_char;
4599 }
4600 }
4601 continue;
4602encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004604 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004605
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 /* code first surrogate */
4607 base64bits += 16;
4608 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4609 while (base64bits >= 6) {
4610 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4611 base64bits -= 6;
4612 }
4613 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004614 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 base64bits += 16;
4617 base64buffer = (base64buffer << 16) | ch;
4618 while (base64bits >= 6) {
4619 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4620 base64bits -= 6;
4621 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004622 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004623 if (base64bits)
4624 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4625 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004627 if (_PyBytes_Resize(&v, out - start) < 0)
4628 return NULL;
4629 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004630}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004631PyObject *
4632PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4633 Py_ssize_t size,
4634 int base64SetO,
4635 int base64WhiteSpace,
4636 const char *errors)
4637{
4638 PyObject *result;
4639 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4640 if (tmp == NULL)
4641 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004642 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004643 base64WhiteSpace, errors);
4644 Py_DECREF(tmp);
4645 return result;
4646}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647
/* The UTF-7 helper macros are private to the codec above; drop them
   (in reverse order of definition) so they cannot leak into the rest
   of the file. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653
Guido van Rossumd57fd912000-03-10 22:53:23 +00004654/* --- UTF-8 Codec -------------------------------------------------------- */
4655
Alexander Belopolsky40018472011-02-26 01:02:56 +00004656PyObject *
4657PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004658 Py_ssize_t size,
4659 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660{
Walter Dörwald69652032004-09-07 20:24:22 +00004661 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4662}
4663
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004664#include "stringlib/asciilib.h"
4665#include "stringlib/codecs.h"
4666#include "stringlib/undef.h"
4667
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004668#include "stringlib/ucs1lib.h"
4669#include "stringlib/codecs.h"
4670#include "stringlib/undef.h"
4671
4672#include "stringlib/ucs2lib.h"
4673#include "stringlib/codecs.h"
4674#include "stringlib/undef.h"
4675
4676#include "stringlib/ucs4lib.h"
4677#include "stringlib/codecs.h"
4678#include "stringlib/undef.h"
4679
Antoine Pitrouab868312009-01-10 15:40:25 +00004680/* Mask to quickly check whether a C 'long' contains a
4681 non-ASCII, UTF8-encoded char. */
4682#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004683# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004684#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004685# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004686#else
4687# error C 'long' size should be either 4 or 8!
4688#endif
4689
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004690static Py_ssize_t
4691ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004692{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004693 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004694 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004695
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004696 /*
4697 * Issue #17237: m68k is a bit different from most architectures in
4698 * that objects do not use "natural alignment" - for example, int and
4699 * long are only aligned at 2-byte boundaries. Therefore the assert()
4700 * won't work; also, tests have shown that skipping the "optimised
4701 * version" will even speed up m68k.
4702 */
4703#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004704#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004705 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4706 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004707 /* Fast path, see in STRINGLIB(utf8_decode) for
4708 an explanation. */
4709 /* Help register allocation */
4710 register const char *_p = p;
4711 register Py_UCS1 * q = dest;
4712 while (_p < aligned_end) {
4713 unsigned long value = *(const unsigned long *) _p;
4714 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004715 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004716 *((unsigned long *)q) = value;
4717 _p += SIZEOF_LONG;
4718 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004719 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720 p = _p;
4721 while (p < end) {
4722 if ((unsigned char)*p & 0x80)
4723 break;
4724 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004726 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004728#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004729#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004730 while (p < end) {
4731 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4732 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004733 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 /* Help register allocation */
4735 register const char *_p = p;
4736 while (_p < aligned_end) {
4737 unsigned long value = *(unsigned long *) _p;
4738 if (value & ASCII_CHAR_MASK)
4739 break;
4740 _p += SIZEOF_LONG;
4741 }
4742 p = _p;
4743 if (_p == end)
4744 break;
4745 }
4746 if ((unsigned char)*p & 0x80)
4747 break;
4748 ++p;
4749 }
4750 memcpy(dest, start, p - start);
4751 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752}
Antoine Pitrouab868312009-01-10 15:40:25 +00004753
Victor Stinner785938e2011-12-11 20:09:03 +01004754PyObject *
4755PyUnicode_DecodeUTF8Stateful(const char *s,
4756 Py_ssize_t size,
4757 const char *errors,
4758 Py_ssize_t *consumed)
4759{
Victor Stinner785938e2011-12-11 20:09:03 +01004760 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004761 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004762 const char *end = s + size;
4763 Py_ssize_t outpos;
4764
4765 Py_ssize_t startinpos;
4766 Py_ssize_t endinpos;
4767 const char *errmsg = "";
4768 PyObject *errorHandler = NULL;
4769 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004770
4771 if (size == 0) {
4772 if (consumed)
4773 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004774 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004775 }
4776
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004777 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4778 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004779 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004780 *consumed = 1;
4781 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004782 }
4783
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004785 if (!unicode)
4786 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004787
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004788 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4789 s += outpos;
4790 while (s < end) {
4791 Py_UCS4 ch;
4792 int kind = PyUnicode_KIND(unicode);
4793 if (kind == PyUnicode_1BYTE_KIND) {
4794 if (PyUnicode_IS_ASCII(unicode))
4795 ch = asciilib_utf8_decode(&s, end,
4796 PyUnicode_1BYTE_DATA(unicode), &outpos);
4797 else
4798 ch = ucs1lib_utf8_decode(&s, end,
4799 PyUnicode_1BYTE_DATA(unicode), &outpos);
4800 } else if (kind == PyUnicode_2BYTE_KIND) {
4801 ch = ucs2lib_utf8_decode(&s, end,
4802 PyUnicode_2BYTE_DATA(unicode), &outpos);
4803 } else {
4804 assert(kind == PyUnicode_4BYTE_KIND);
4805 ch = ucs4lib_utf8_decode(&s, end,
4806 PyUnicode_4BYTE_DATA(unicode), &outpos);
4807 }
4808
4809 switch (ch) {
4810 case 0:
4811 if (s == end || consumed)
4812 goto End;
4813 errmsg = "unexpected end of data";
4814 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004815 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004816 break;
4817 case 1:
4818 errmsg = "invalid start byte";
4819 startinpos = s - starts;
4820 endinpos = startinpos + 1;
4821 break;
4822 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004823 case 3:
4824 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004825 errmsg = "invalid continuation byte";
4826 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004827 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004828 break;
4829 default:
4830 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4831 goto onError;
4832 continue;
4833 }
4834
4835 if (unicode_decode_call_errorhandler(
4836 errors, &errorHandler,
4837 "utf-8", errmsg,
4838 &starts, &end, &startinpos, &endinpos, &exc, &s,
4839 &unicode, &outpos))
4840 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004841 }
4842
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004843End:
4844 if (unicode_resize(&unicode, outpos) < 0)
4845 goto onError;
4846
4847 if (consumed)
4848 *consumed = s - starts;
4849
4850 Py_XDECREF(errorHandler);
4851 Py_XDECREF(exc);
4852 assert(_PyUnicode_CheckConsistency(unicode, 1));
4853 return unicode;
4854
4855onError:
4856 Py_XDECREF(errorHandler);
4857 Py_XDECREF(exc);
4858 Py_XDECREF(unicode);
4859 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004860}
4861
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s, size  the UTF-8 encoded bytes; a negative size is rejected.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory), or NULL on memory allocation error
   or when the buffer size would overflow.  Each byte that cannot be
   decoded is stored as 0xDC00 + byte value. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 elements are enough for the result
       and its terminator.  The check is written so that `size + 1` is
       not computed until it is known not to overflow, and so that both
       sides of the comparison are signed. */
    if (size < 0 ||
        size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* not expected with 4-byte wchar_t */
            assert(0);
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* 0 with the input exhausted means decoding is complete */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004918/* Primary internal function which creates utf8 encoded bytes objects.
4919
4920 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004921 and allocate exactly as much space needed at the end. Else allocate the
4922 maximum possible needed (4 result bytes per Unicode character), and return
4923 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004924*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004925PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004926_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927{
Victor Stinner6099a032011-12-18 14:22:26 +01004928 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004929 void *data;
4930 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004932 if (!PyUnicode_Check(unicode)) {
4933 PyErr_BadArgument();
4934 return NULL;
4935 }
4936
4937 if (PyUnicode_READY(unicode) == -1)
4938 return NULL;
4939
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004940 if (PyUnicode_UTF8(unicode))
4941 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4942 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004943
4944 kind = PyUnicode_KIND(unicode);
4945 data = PyUnicode_DATA(unicode);
4946 size = PyUnicode_GET_LENGTH(unicode);
4947
Benjamin Petersonead6b532011-12-20 17:23:42 -06004948 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004949 default:
4950 assert(0);
4951 case PyUnicode_1BYTE_KIND:
4952 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4953 assert(!PyUnicode_IS_ASCII(unicode));
4954 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4955 case PyUnicode_2BYTE_KIND:
4956 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4957 case PyUnicode_4BYTE_KIND:
4958 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960}
4961
Alexander Belopolsky40018472011-02-26 01:02:56 +00004962PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004963PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4964 Py_ssize_t size,
4965 const char *errors)
4966{
4967 PyObject *v, *unicode;
4968
4969 unicode = PyUnicode_FromUnicode(s, size);
4970 if (unicode == NULL)
4971 return NULL;
4972 v = _PyUnicode_AsUTF8String(unicode, errors);
4973 Py_DECREF(unicode);
4974 return v;
4975}
4976
4977PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004978PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004980 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981}
4982
Walter Dörwald41980ca2007-08-16 21:55:45 +00004983/* --- UTF-32 Codec ------------------------------------------------------- */
4984
4985PyObject *
4986PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004987 Py_ssize_t size,
4988 const char *errors,
4989 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990{
4991 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4992}
4993
4994PyObject *
4995PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 Py_ssize_t size,
4997 const char *errors,
4998 int *byteorder,
4999 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005000{
5001 const char *starts = s;
5002 Py_ssize_t startinpos;
5003 Py_ssize_t endinpos;
5004 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005005 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005006 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005007 int bo = 0; /* assume native ordering by default */
5008 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005009 /* Offsets from q for retrieving bytes in the right order. */
5010#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5011 int iorder[] = {0, 1, 2, 3};
5012#else
5013 int iorder[] = {3, 2, 1, 0};
5014#endif
5015 PyObject *errorHandler = NULL;
5016 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005017
Walter Dörwald41980ca2007-08-16 21:55:45 +00005018 q = (unsigned char *)s;
5019 e = q + size;
5020
5021 if (byteorder)
5022 bo = *byteorder;
5023
5024 /* Check for BOM marks (U+FEFF) in the input and adjust current
5025 byte order setting accordingly. In native mode, the leading BOM
5026 mark is skipped, in all other modes, it is copied to the output
5027 stream as-is (giving a ZWNBSP character). */
5028 if (bo == 0) {
5029 if (size >= 4) {
5030 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005032#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 if (bom == 0x0000FEFF) {
5034 q += 4;
5035 bo = -1;
5036 }
5037 else if (bom == 0xFFFE0000) {
5038 q += 4;
5039 bo = 1;
5040 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 if (bom == 0x0000FEFF) {
5043 q += 4;
5044 bo = 1;
5045 }
5046 else if (bom == 0xFFFE0000) {
5047 q += 4;
5048 bo = -1;
5049 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005050#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005052 }
5053
5054 if (bo == -1) {
5055 /* force LE */
5056 iorder[0] = 0;
5057 iorder[1] = 1;
5058 iorder[2] = 2;
5059 iorder[3] = 3;
5060 }
5061 else if (bo == 1) {
5062 /* force BE */
5063 iorder[0] = 3;
5064 iorder[1] = 2;
5065 iorder[2] = 1;
5066 iorder[3] = 0;
5067 }
5068
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005069 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005070 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005071 if (!unicode)
5072 return NULL;
5073 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005074 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005075 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005076
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 Py_UCS4 ch;
5079 /* remaining bytes at the end? (size should be divisible by 4) */
5080 if (e-q<4) {
5081 if (consumed)
5082 break;
5083 errmsg = "truncated data";
5084 startinpos = ((const char *)q)-starts;
5085 endinpos = ((const char *)e)-starts;
5086 goto utf32Error;
5087 /* The remaining input chars are ignored if the callback
5088 chooses to skip the input */
5089 }
5090 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5091 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 if (ch >= 0x110000)
5094 {
5095 errmsg = "codepoint not in range(0x110000)";
5096 startinpos = ((const char *)q)-starts;
5097 endinpos = startinpos+4;
5098 goto utf32Error;
5099 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005100 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5101 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 q += 4;
5103 continue;
5104 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 if (unicode_decode_call_errorhandler(
5106 errors, &errorHandler,
5107 "utf32", errmsg,
5108 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005109 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111 }
5112
5113 if (byteorder)
5114 *byteorder = bo;
5115
5116 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005117 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005118
5119 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005120 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005121 goto onError;
5122
5123 Py_XDECREF(errorHandler);
5124 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005125 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005126
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005128 Py_DECREF(unicode);
5129 Py_XDECREF(errorHandler);
5130 Py_XDECREF(exc);
5131 return NULL;
5132}
5133
5134PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005135_PyUnicode_EncodeUTF32(PyObject *str,
5136 const char *errors,
5137 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005139 int kind;
5140 void *data;
5141 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005142 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005143 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005144 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005145 /* Offsets from p for storing byte pairs in the right order. */
5146#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5147 int iorder[] = {0, 1, 2, 3};
5148#else
5149 int iorder[] = {3, 2, 1, 0};
5150#endif
5151
Benjamin Peterson29060642009-01-31 22:14:21 +00005152#define STORECHAR(CH) \
5153 do { \
5154 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5155 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5156 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5157 p[iorder[0]] = (CH) & 0xff; \
5158 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005159 } while(0)
5160
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005161 if (!PyUnicode_Check(str)) {
5162 PyErr_BadArgument();
5163 return NULL;
5164 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005165 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005166 return NULL;
5167 kind = PyUnicode_KIND(str);
5168 data = PyUnicode_DATA(str);
5169 len = PyUnicode_GET_LENGTH(str);
5170
5171 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005172 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005174 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005175 if (v == NULL)
5176 return NULL;
5177
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005178 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005179 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005181 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005182 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005183
5184 if (byteorder == -1) {
5185 /* force LE */
5186 iorder[0] = 0;
5187 iorder[1] = 1;
5188 iorder[2] = 2;
5189 iorder[3] = 3;
5190 }
5191 else if (byteorder == 1) {
5192 /* force BE */
5193 iorder[0] = 3;
5194 iorder[1] = 2;
5195 iorder[2] = 1;
5196 iorder[3] = 0;
5197 }
5198
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005199 for (i = 0; i < len; i++)
5200 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005201
5202 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005203 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005204#undef STORECHAR
5205}
5206
Alexander Belopolsky40018472011-02-26 01:02:56 +00005207PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005208PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5209 Py_ssize_t size,
5210 const char *errors,
5211 int byteorder)
5212{
5213 PyObject *result;
5214 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5215 if (tmp == NULL)
5216 return NULL;
5217 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5218 Py_DECREF(tmp);
5219 return result;
5220}
5221
5222PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005223PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005224{
Victor Stinnerb960b342011-11-20 19:12:52 +01005225 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005226}
5227
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228/* --- UTF-16 Codec ------------------------------------------------------- */
5229
Tim Peters772747b2001-08-09 22:21:55 +00005230PyObject *
5231PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 Py_ssize_t size,
5233 const char *errors,
5234 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235{
Walter Dörwald69652032004-09-07 20:24:22 +00005236 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5237}
5238
/* Decode `size` bytes of UTF-16 data at `s` into a new str object.

   s, size:   input buffer and its length in bytes.
   errors:    error-handler name passed to unicode_decode_call_errorhandler().
   byteorder: optional in/out parameter.  On entry, a value <= -1 or >= 1
              selects a fixed byte order (see native_ordering below) and 0
              means "native order, detect a leading BOM".  It is written back
              only in the BOM-detection branch.
   consumed:  optional output.  When non-NULL, incomplete trailing data is
              not an error: decoding stops and *consumed receives the number
              of bytes used.

   Returns the decoded string, or NULL on failure. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    PyObject *unicode;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* The first two bytes are always combined as little endian here, so
           0xFEFF means the data is little endian and 0xFFFE big endian. */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Empty input, or nothing left after the BOM: the whole buffer counts
       as consumed. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 (no BOM found) counts as native ordering on either platform. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    native_ordering = bo <= 0;
#else
    native_ordering = bo >= 0;
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = PyUnicode_New((e - q + 1) / 2, 127);
    if (!unicode)
        return NULL;

    outpos = 0;
    while (1) {
        /* ch stays 0 when fewer than two bytes remain, which routes to the
           "case 0" handling below. */
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            /* The kind is re-read on every iteration and selects the
               decoder variant matching the output buffer.
               NOTE(review): presumably the kind can change when
               unicode_putchar() or the error handler replaces `unicode`;
               confirm against those helpers. */
            int kind = PyUnicode_KIND(unicode);
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(unicode))
                    ch = asciilib_utf16_decode(&q, e,
                            PyUnicode_1BYTE_DATA(unicode), &outpos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            PyUnicode_1BYTE_DATA(unicode), &outpos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        PyUnicode_2BYTE_DATA(unicode), &outpos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        PyUnicode_4BYTE_DATA(unicode), &outpos,
                        native_ordering);
            }
        }

        /* Values 0..3 are treated as status codes; any other value is a
           code point to append to the output. */
        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* Step back two bytes so the unfinished unit is either left
               unconsumed (stateful mode) or included in the error range.
               NOTE(review): presumably a lead surrogate with no trail
               surrogate available yet; confirm in the stringlib decoder. */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* Error range: the two bytes immediately before q. */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* Error range: two bytes starting four bytes before q. */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            continue;
        }

        /* Only the error cases above reach this point.  The handler receives
           the addresses of starts, e, q, unicode and outpos and may update
           them before the loop resumes. */
        if (unicode_decode_call_errorhandler(
                errors,
                &errorHandler,
                "utf16", errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &unicode,
                &outpos))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    /* Adjust length */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return unicode_result(unicode);

  onError:
    Py_DECREF(unicode);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5395
Tim Peters772747b2001-08-09 22:21:55 +00005396PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005397_PyUnicode_EncodeUTF16(PyObject *str,
5398 const char *errors,
5399 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005401 enum PyUnicode_Kind kind;
5402 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005403 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005404 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005405 unsigned short *out;
5406 Py_ssize_t bytesize;
5407 Py_ssize_t pairs;
5408#ifdef WORDS_BIGENDIAN
5409 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005410#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005411 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005412#endif
5413
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005414 if (!PyUnicode_Check(str)) {
5415 PyErr_BadArgument();
5416 return NULL;
5417 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005418 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005419 return NULL;
5420 kind = PyUnicode_KIND(str);
5421 data = PyUnicode_DATA(str);
5422 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005423
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005424 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005425 if (kind == PyUnicode_4BYTE_KIND) {
5426 const Py_UCS4 *in = (const Py_UCS4 *)data;
5427 const Py_UCS4 *end = in + len;
5428 while (in < end)
5429 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005430 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005431 }
5432 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005433 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005434 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005435 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 if (v == NULL)
5437 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005439 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005440 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005441 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005443 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005445 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005446
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005447 switch (kind) {
5448 case PyUnicode_1BYTE_KIND: {
5449 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5450 break;
Tim Peters772747b2001-08-09 22:21:55 +00005451 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005452 case PyUnicode_2BYTE_KIND: {
5453 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5454 break;
Tim Peters772747b2001-08-09 22:21:55 +00005455 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005456 case PyUnicode_4BYTE_KIND: {
5457 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5458 break;
5459 }
5460 default:
5461 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005462 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005463
5464 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005465 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466}
5467
Alexander Belopolsky40018472011-02-26 01:02:56 +00005468PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005469PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5470 Py_ssize_t size,
5471 const char *errors,
5472 int byteorder)
5473{
5474 PyObject *result;
5475 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5476 if (tmp == NULL)
5477 return NULL;
5478 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5479 Py_DECREF(tmp);
5480 return result;
5481}
5482
5483PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005484PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005486 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487}
5488
5489/* --- Unicode Escape Codec ----------------------------------------------- */
5490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005491/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5492 if all the escapes in the string make it still a valid ASCII string.
5493 Returns -1 if any escapes were found which cause the string to
5494 pop out of ASCII range. Otherwise returns the length of the
5495 required buffer to hold the string.
5496 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005497static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005498length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5499{
5500 const unsigned char *p = (const unsigned char *)s;
5501 const unsigned char *end = p + size;
5502 Py_ssize_t length = 0;
5503
5504 if (size < 0)
5505 return -1;
5506
5507 for (; p < end; ++p) {
5508 if (*p > 127) {
5509 /* Non-ASCII */
5510 return -1;
5511 }
5512 else if (*p != '\\') {
5513 /* Normal character */
5514 ++length;
5515 }
5516 else {
5517 /* Backslash-escape, check next char */
5518 ++p;
5519 /* Escape sequence reaches till end of string or
5520 non-ASCII follow-up. */
5521 if (p >= end || *p > 127)
5522 return -1;
5523 switch (*p) {
5524 case '\n':
5525 /* backslash + \n result in zero characters */
5526 break;
5527 case '\\': case '\'': case '\"':
5528 case 'b': case 'f': case 't':
5529 case 'n': case 'r': case 'v': case 'a':
5530 ++length;
5531 break;
5532 case '0': case '1': case '2': case '3':
5533 case '4': case '5': case '6': case '7':
5534 case 'x': case 'u': case 'U': case 'N':
5535 /* these do not guarantee ASCII characters */
5536 return -1;
5537 default:
5538 /* count the backslash + the other character */
5539 length += 2;
5540 }
5541 }
5542 }
5543 return length;
5544}
5545
/* Cached pointer to the character-name lookup C API.  It starts out NULL and
   is filled in by PyUnicode_DecodeUnicodeEscape() through PyCapsule_Import()
   the first time a \N{...} escape has to be decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005547
Alexander Belopolsky40018472011-02-26 01:02:56 +00005548PyObject *
5549PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005550 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005551 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005553 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005554 Py_ssize_t startinpos;
5555 Py_ssize_t endinpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005556 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005558 char* message;
5559 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005560 PyObject *errorHandler = NULL;
5561 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005562 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005563 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005564
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005565 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005566
5567 /* After length_of_escaped_ascii_string() there are two alternatives,
5568 either the string is pure ASCII with named escapes like \n, etc.
5569 and we determined it's exact size (common case)
5570 or it contains \x, \u, ... escape sequences. then we create a
5571 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005572 if (len >= 0) {
5573 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005574 if (!v)
5575 goto onError;
5576 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005577 }
5578 else {
5579 /* Escaped strings will always be longer than the resulting
5580 Unicode string, so we start with size here and then reduce the
5581 length after conversion to the true value.
5582 (but if the error callback returns a long replacement string
5583 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005584 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005585 if (!v)
5586 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005587 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005588 }
5589
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005591 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005592 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005594
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 while (s < end) {
5596 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005597 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005600 /* The only case in which i == ascii_length is a backslash
5601 followed by a newline. */
5602 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005603
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604 /* Non-escape characters are interpreted as Unicode ordinals */
5605 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005606 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5607 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 continue;
5609 }
5610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005611 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 /* \ - Escapes */
5613 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005614 c = *s++;
5615 if (s > end)
5616 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005617
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005618 /* The only case in which i == ascii_length is a backslash
5619 followed by a newline. */
5620 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005621
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005622 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005625#define WRITECHAR(ch) \
5626 do { \
5627 if (unicode_putchar(&v, &i, ch) < 0) \
5628 goto onError; \
5629 }while(0)
5630
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005632 case '\\': WRITECHAR('\\'); break;
5633 case '\'': WRITECHAR('\''); break;
5634 case '\"': WRITECHAR('\"'); break;
5635 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005636 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005637 case 'f': WRITECHAR('\014'); break;
5638 case 't': WRITECHAR('\t'); break;
5639 case 'n': WRITECHAR('\n'); break;
5640 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005641 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005642 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005643 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005644 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645
Benjamin Peterson29060642009-01-31 22:14:21 +00005646 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 case '0': case '1': case '2': case '3':
5648 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005649 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005650 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005651 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005652 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005653 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005655 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 break;
5657
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 /* hex escapes */
5659 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005661 digits = 2;
5662 message = "truncated \\xXX escape";
5663 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005667 digits = 4;
5668 message = "truncated \\uXXXX escape";
5669 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005672 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005673 digits = 8;
5674 message = "truncated \\UXXXXXXXX escape";
5675 hexescape:
5676 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005677 if (end - s < digits) {
5678 /* count only hex digits */
5679 for (; s < end; ++s) {
5680 c = (unsigned char)*s;
5681 if (!Py_ISXDIGIT(c))
5682 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005683 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005684 goto error;
5685 }
5686 for (; digits--; ++s) {
5687 c = (unsigned char)*s;
5688 if (!Py_ISXDIGIT(c))
5689 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005690 chr = (chr<<4) & ~0xF;
5691 if (c >= '0' && c <= '9')
5692 chr += c - '0';
5693 else if (c >= 'a' && c <= 'f')
5694 chr += 10 + c - 'a';
5695 else
5696 chr += 10 + c - 'A';
5697 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005698 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699 /* _decoding_error will have already written into the
5700 target buffer. */
5701 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005702 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005703 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005704 message = "illegal Unicode character";
5705 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005706 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005707 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005708 break;
5709
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005711 case 'N':
5712 message = "malformed \\N character escape";
5713 if (ucnhash_CAPI == NULL) {
5714 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005715 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5716 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005717 if (ucnhash_CAPI == NULL)
5718 goto ucnhashError;
5719 }
5720 if (*s == '{') {
5721 const char *start = s+1;
5722 /* look for the closing brace */
5723 while (*s != '}' && s < end)
5724 s++;
5725 if (s > start && s < end && *s == '}') {
5726 /* found a name. look it up in the unicode database */
5727 message = "unknown Unicode character name";
5728 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005729 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005730 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005731 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005732 goto store;
5733 }
5734 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005735 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005736
5737 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005738 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739 message = "\\ at end of string";
5740 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005741 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005742 }
5743 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005744 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005745 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005746 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005747 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005749 continue;
5750
5751 error:
5752 endinpos = s-starts;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005753 if (unicode_decode_call_errorhandler(
5754 errors, &errorHandler,
5755 "unicodeescape", message,
5756 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005757 &v, &i))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005758 goto onError;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005759 len = PyUnicode_GET_LENGTH(v);
Serhiy Storchakad6793772013-01-29 10:20:44 +02005760 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005762#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005763
Victor Stinner16e6a802011-12-12 13:24:15 +01005764 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005765 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005766 Py_XDECREF(errorHandler);
5767 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005768 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005769
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005771 PyErr_SetString(
5772 PyExc_UnicodeError,
5773 "\\N escapes not supported (can't load unicodedata module)"
5774 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005775 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 Py_XDECREF(errorHandler);
5777 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005778 return NULL;
5779
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 Py_XDECREF(errorHandler);
5783 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 return NULL;
5785}
5786
5787/* Return a Unicode-Escape string version of the Unicode object.
5788
5789 If quotes is true, the string is enclosed in u"" or u'' quotes as
5790 appropriate.
5791
5792*/
5793
Alexander Belopolsky40018472011-02-26 01:02:56 +00005794PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005795PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005797 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005798 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005800 int kind;
5801 void *data;
5802 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803
Ezio Melottie7f90372012-10-05 03:33:31 +03005804 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005805 escape.
5806
Ezio Melottie7f90372012-10-05 03:33:31 +03005807 For UCS1 strings it's '\xxx', 4 bytes per source character.
5808 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5809 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005810 */
5811
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005812 if (!PyUnicode_Check(unicode)) {
5813 PyErr_BadArgument();
5814 return NULL;
5815 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005816 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005817 return NULL;
5818 len = PyUnicode_GET_LENGTH(unicode);
5819 kind = PyUnicode_KIND(unicode);
5820 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005821 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005822 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5823 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5824 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5825 }
5826
5827 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005828 return PyBytes_FromStringAndSize(NULL, 0);
5829
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005830 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005832
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005833 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005835 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 if (repr == NULL)
5838 return NULL;
5839
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005840 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005842 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005843 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005844
Walter Dörwald79e913e2007-05-12 11:08:06 +00005845 /* Escape backslashes */
5846 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 *p++ = '\\';
5848 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005849 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005850 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005851
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005852 /* Map 21-bit characters to '\U00xxxxxx' */
5853 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005854 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005855 *p++ = '\\';
5856 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005857 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5858 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5859 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5860 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5861 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5862 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5863 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5864 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005866 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005867
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005869 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 *p++ = '\\';
5871 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005872 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5873 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5874 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5875 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005877
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005878 /* Map special whitespace to '\t', \n', '\r' */
5879 else if (ch == '\t') {
5880 *p++ = '\\';
5881 *p++ = 't';
5882 }
5883 else if (ch == '\n') {
5884 *p++ = '\\';
5885 *p++ = 'n';
5886 }
5887 else if (ch == '\r') {
5888 *p++ = '\\';
5889 *p++ = 'r';
5890 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005891
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005892 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005893 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005895 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005896 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5897 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005898 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005899
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 /* Copy everything else as-is */
5901 else
5902 *p++ = (char) ch;
5903 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005905 assert(p - PyBytes_AS_STRING(repr) > 0);
5906 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5907 return NULL;
5908 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909}
5910
Alexander Belopolsky40018472011-02-26 01:02:56 +00005911PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005912PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5913 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005915 PyObject *result;
5916 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5917 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005919 result = PyUnicode_AsUnicodeEscapeString(tmp);
5920 Py_DECREF(tmp);
5921 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922}
5923
5924/* --- Raw Unicode Escape Codec ------------------------------------------- */
5925
Alexander Belopolsky40018472011-02-26 01:02:56 +00005926PyObject *
5927PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005928 Py_ssize_t size,
5929 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005932 Py_ssize_t startinpos;
5933 Py_ssize_t endinpos;
5934 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005935 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 const char *end;
5937 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 PyObject *errorHandler = NULL;
5939 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005940
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 /* Escaped strings will always be longer than the resulting
5942 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005943 length after conversion to the true value. (But decoding error
5944 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005945 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005949 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005950 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 end = s + size;
5952 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 unsigned char c;
5954 Py_UCS4 x;
5955 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005956 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957
Benjamin Peterson29060642009-01-31 22:14:21 +00005958 /* Non-escape characters are interpreted as Unicode ordinals */
5959 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005960 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5961 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005963 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 startinpos = s-starts;
5965
5966 /* \u-escapes are only interpreted iff the number of leading
5967 backslashes if odd */
5968 bs = s;
5969 for (;s < end;) {
5970 if (*s != '\\')
5971 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005972 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5973 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 }
5975 if (((s - bs) & 1) == 0 ||
5976 s >= end ||
5977 (*s != 'u' && *s != 'U')) {
5978 continue;
5979 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005980 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 count = *s=='u' ? 4 : 8;
5982 s++;
5983
5984 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 for (x = 0, i = 0; i < count; ++i, ++s) {
5986 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005987 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 endinpos = s-starts;
5989 if (unicode_decode_call_errorhandler(
5990 errors, &errorHandler,
5991 "rawunicodeescape", "truncated \\uXXXX",
5992 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005993 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 goto onError;
5995 goto nextByte;
5996 }
5997 x = (x<<4) & ~0xF;
5998 if (c >= '0' && c <= '9')
5999 x += c - '0';
6000 else if (c >= 'a' && c <= 'f')
6001 x += 10 + c - 'a';
6002 else
6003 x += 10 + c - 'A';
6004 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006005 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006006 if (unicode_putchar(&v, &outpos, x) < 0)
6007 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006008 } else {
6009 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006010 if (unicode_decode_call_errorhandler(
6011 errors, &errorHandler,
6012 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006014 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006016 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 nextByte:
6018 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006020 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022 Py_XDECREF(errorHandler);
6023 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006024 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006025
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006028 Py_XDECREF(errorHandler);
6029 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 return NULL;
6031}
6032
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006033
Alexander Belopolsky40018472011-02-26 01:02:56 +00006034PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006035PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006037 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 char *p;
6039 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006040 Py_ssize_t expandsize, pos;
6041 int kind;
6042 void *data;
6043 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006045 if (!PyUnicode_Check(unicode)) {
6046 PyErr_BadArgument();
6047 return NULL;
6048 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006049 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006050 return NULL;
6051 kind = PyUnicode_KIND(unicode);
6052 data = PyUnicode_DATA(unicode);
6053 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006054 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6055 bytes, and 1 byte characters 4. */
6056 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006057
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006058 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006060
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006061 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 if (repr == NULL)
6063 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006064 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006065 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006067 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006068 for (pos = 0; pos < len; pos++) {
6069 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 /* Map 32-bit characters to '\Uxxxxxxxx' */
6071 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006072 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006073 *p++ = '\\';
6074 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006075 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6076 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6077 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6078 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6079 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6080 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6081 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6082 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006083 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006085 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 *p++ = '\\';
6087 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006088 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6089 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6090 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6091 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 /* Copy everything else as-is */
6094 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 *p++ = (char) ch;
6096 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006097
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006098 assert(p > q);
6099 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006100 return NULL;
6101 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102}
6103
Alexander Belopolsky40018472011-02-26 01:02:56 +00006104PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006105PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6106 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006108 PyObject *result;
6109 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6110 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006111 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006112 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6113 Py_DECREF(tmp);
6114 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115}
6116
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006117/* --- Unicode Internal Codec ------------------------------------------- */
6118
Alexander Belopolsky40018472011-02-26 01:02:56 +00006119PyObject *
6120_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006121 Py_ssize_t size,
6122 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006123{
6124 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006125 Py_ssize_t startinpos;
6126 Py_ssize_t endinpos;
6127 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006128 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006129 const char *end;
6130 const char *reason;
6131 PyObject *errorHandler = NULL;
6132 PyObject *exc = NULL;
6133
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006134 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006135 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006136 1))
6137 return NULL;
6138
Thomas Wouters89f507f2006-12-13 04:49:30 +00006139 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006140 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006141 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006143 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006144 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006145 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006146 end = s + size;
6147
6148 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006149 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006150 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006151 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006152 endinpos = end-starts;
6153 reason = "truncated input";
6154 goto error;
6155 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006156 /* We copy the raw representation one byte at a time because the
6157 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006158 ((char *) &uch)[0] = s[0];
6159 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006160#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006161 ((char *) &uch)[2] = s[2];
6162 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006163#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006164 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006165#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006166 /* We have to sanity check the raw data, otherwise doom looms for
6167 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006168 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006169 endinpos = s - starts + Py_UNICODE_SIZE;
6170 reason = "illegal code point (> 0x10FFFF)";
6171 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006172 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006173#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006174 s += Py_UNICODE_SIZE;
6175#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006176 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006177 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006178 Py_UNICODE uch2;
6179 ((char *) &uch2)[0] = s[0];
6180 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006181 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006182 {
Victor Stinner551ac952011-11-29 22:58:13 +01006183 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006184 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006185 }
6186 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006187#endif
6188
6189 if (unicode_putchar(&v, &outpos, ch) < 0)
6190 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006191 continue;
6192
6193 error:
6194 startinpos = s - starts;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006195 if (unicode_decode_call_errorhandler(
6196 errors, &errorHandler,
6197 "unicode_internal", reason,
6198 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006199 &v, &outpos))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006200 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006201 }
6202
Victor Stinner16e6a802011-12-12 13:24:15 +01006203 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006204 goto onError;
6205 Py_XDECREF(errorHandler);
6206 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006207 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006208
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006210 Py_XDECREF(v);
6211 Py_XDECREF(errorHandler);
6212 Py_XDECREF(exc);
6213 return NULL;
6214}
6215
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216/* --- Latin-1 Codec ------------------------------------------------------ */
6217
Alexander Belopolsky40018472011-02-26 01:02:56 +00006218PyObject *
6219PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006220 Py_ssize_t size,
6221 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006224 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225}
6226
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006227/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006228static void
6229make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006230 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006231 PyObject *unicode,
6232 Py_ssize_t startpos, Py_ssize_t endpos,
6233 const char *reason)
6234{
6235 if (*exceptionObject == NULL) {
6236 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006237 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006238 encoding, unicode, startpos, endpos, reason);
6239 }
6240 else {
6241 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6242 goto onError;
6243 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6244 goto onError;
6245 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6246 goto onError;
6247 return;
6248 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006249 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006250 }
6251}
6252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006253/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006254static void
6255raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006256 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006257 PyObject *unicode,
6258 Py_ssize_t startpos, Py_ssize_t endpos,
6259 const char *reason)
6260{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006261 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006262 encoding, unicode, startpos, endpos, reason);
6263 if (*exceptionObject != NULL)
6264 PyCodec_StrictErrors(*exceptionObject);
6265}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006266
6267/* error handling callback helper:
6268 build arguments, call the callback and check the arguments,
6269 put the result into newpos and return the replacement string, which
6270 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006271static PyObject *
6272unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006273 PyObject **errorHandler,
6274 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006275 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006276 Py_ssize_t startpos, Py_ssize_t endpos,
6277 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006278{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006279 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006280 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281 PyObject *restuple;
6282 PyObject *resunicode;
6283
6284 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006286 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006288 }
6289
Benjamin Petersonbac79492012-01-14 13:34:47 -05006290 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006291 return NULL;
6292 len = PyUnicode_GET_LENGTH(unicode);
6293
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006294 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006295 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298
6299 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006303 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006304 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 Py_DECREF(restuple);
6306 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006308 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 &resunicode, newpos)) {
6310 Py_DECREF(restuple);
6311 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006312 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006313 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6314 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6315 Py_DECREF(restuple);
6316 return NULL;
6317 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006318 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006319 *newpos = len + *newpos;
6320 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6322 Py_DECREF(restuple);
6323 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006324 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006325 Py_INCREF(resunicode);
6326 Py_DECREF(restuple);
6327 return resunicode;
6328}
6329
Alexander Belopolsky40018472011-02-26 01:02:56 +00006330static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006331unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006332 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006333 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006334{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006335 /* input state */
6336 Py_ssize_t pos=0, size;
6337 int kind;
6338 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006339 /* output object */
6340 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006341 /* pointer into the output */
6342 char *str;
6343 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006344 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006345 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6346 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347 PyObject *errorHandler = NULL;
6348 PyObject *exc = NULL;
6349 /* the following variable is used for caching string comparisons
6350 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6351 int known_errorHandler = -1;
6352
Benjamin Petersonbac79492012-01-14 13:34:47 -05006353 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006354 return NULL;
6355 size = PyUnicode_GET_LENGTH(unicode);
6356 kind = PyUnicode_KIND(unicode);
6357 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006358 /* allocate enough for a simple encoding without
6359 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006360 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006361 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006362 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006364 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006365 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006366 ressize = size;
6367
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006368 while (pos < size) {
6369 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006370
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 /* can we encode this? */
6372 if (c<limit) {
6373 /* no overflow check, because we know that the space is enough */
6374 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006375 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006376 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 Py_ssize_t requiredsize;
6379 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006380 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006382 Py_ssize_t collstart = pos;
6383 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006385 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 ++collend;
6387 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6388 if (known_errorHandler==-1) {
6389 if ((errors==NULL) || (!strcmp(errors, "strict")))
6390 known_errorHandler = 1;
6391 else if (!strcmp(errors, "replace"))
6392 known_errorHandler = 2;
6393 else if (!strcmp(errors, "ignore"))
6394 known_errorHandler = 3;
6395 else if (!strcmp(errors, "xmlcharrefreplace"))
6396 known_errorHandler = 4;
6397 else
6398 known_errorHandler = 0;
6399 }
6400 switch (known_errorHandler) {
6401 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006402 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 goto onError;
6404 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006405 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 *str++ = '?'; /* fall through */
6407 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006408 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 break;
6410 case 4: /* xmlcharrefreplace */
6411 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006412 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006413 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006414 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006415 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006416 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006417 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006418 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006419 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006420 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006421 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006422 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006423 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006424 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006425 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006426 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006427 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006428 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006429 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006430 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006431 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006432 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006433 if (requiredsize > PY_SSIZE_T_MAX - incr)
6434 goto overflow;
6435 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006437 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6438 goto overflow;
6439 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006440 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006441 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 requiredsize = 2*ressize;
6443 if (_PyBytes_Resize(&res, requiredsize))
6444 goto onError;
6445 str = PyBytes_AS_STRING(res) + respos;
6446 ressize = requiredsize;
6447 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006448 /* generate replacement */
6449 for (i = collstart; i < collend; ++i) {
6450 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006452 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 break;
6454 default:
6455 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006456 encoding, reason, unicode, &exc,
6457 collstart, collend, &newpos);
6458 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006459 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006461 if (PyBytes_Check(repunicode)) {
6462 /* Directly copy bytes result to output. */
6463 repsize = PyBytes_Size(repunicode);
6464 if (repsize > 1) {
6465 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006466 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006467 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6468 Py_DECREF(repunicode);
6469 goto overflow;
6470 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006471 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6472 Py_DECREF(repunicode);
6473 goto onError;
6474 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006475 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006476 ressize += repsize-1;
6477 }
6478 memcpy(str, PyBytes_AsString(repunicode), repsize);
6479 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006480 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006481 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006482 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006483 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 /* need more space? (at least enough for what we
6485 have+the replacement+the rest of the string, so
6486 we won't have to check space for encodable characters) */
6487 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006488 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006489 requiredsize = respos;
6490 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6491 goto overflow;
6492 requiredsize += repsize;
6493 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6494 goto overflow;
6495 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006497 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 requiredsize = 2*ressize;
6499 if (_PyBytes_Resize(&res, requiredsize)) {
6500 Py_DECREF(repunicode);
6501 goto onError;
6502 }
6503 str = PyBytes_AS_STRING(res) + respos;
6504 ressize = requiredsize;
6505 }
6506 /* check if there is anything unencodable in the replacement
6507 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006508 for (i = 0; repsize-->0; ++i, ++str) {
6509 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006511 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006512 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 Py_DECREF(repunicode);
6514 goto onError;
6515 }
6516 *str = (char)c;
6517 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006519 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006520 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006521 }
6522 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006523 /* Resize if we allocated to much */
6524 size = str - PyBytes_AS_STRING(res);
6525 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006526 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006527 if (_PyBytes_Resize(&res, size) < 0)
6528 goto onError;
6529 }
6530
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006531 Py_XDECREF(errorHandler);
6532 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006533 return res;
6534
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006535 overflow:
6536 PyErr_SetString(PyExc_OverflowError,
6537 "encoded result is too long for a Python string");
6538
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006539 onError:
6540 Py_XDECREF(res);
6541 Py_XDECREF(errorHandler);
6542 Py_XDECREF(exc);
6543 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006544}
6545
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006546/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006547PyObject *
6548PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006549 Py_ssize_t size,
6550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006552 PyObject *result;
6553 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6554 if (unicode == NULL)
6555 return NULL;
6556 result = unicode_encode_ucs1(unicode, errors, 256);
6557 Py_DECREF(unicode);
6558 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559}
6560
Alexander Belopolsky40018472011-02-26 01:02:56 +00006561PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006562_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563{
6564 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 PyErr_BadArgument();
6566 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006568 if (PyUnicode_READY(unicode) == -1)
6569 return NULL;
6570 /* Fast path: if it is a one-byte string, construct
6571 bytes object directly. */
6572 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6573 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6574 PyUnicode_GET_LENGTH(unicode));
6575 /* Non-Latin-1 characters present. Defer to above function to
6576 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006577 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006578}
6579
6580PyObject*
6581PyUnicode_AsLatin1String(PyObject *unicode)
6582{
6583 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584}
6585
6586/* --- 7-bit ASCII Codec -------------------------------------------------- */
6587
Alexander Belopolsky40018472011-02-26 01:02:56 +00006588PyObject *
6589PyUnicode_DecodeASCII(const char *s,
6590 Py_ssize_t size,
6591 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006593 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006594 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006595 int kind;
6596 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006597 Py_ssize_t startinpos;
6598 Py_ssize_t endinpos;
6599 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006600 const char *e;
6601 PyObject *errorHandler = NULL;
6602 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006603
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006605 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006606
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006608 if (size == 1 && (unsigned char)s[0] < 128)
6609 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006610
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006611 unicode = PyUnicode_New(size, 127);
6612 if (unicode == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006614
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006615 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006616 data = PyUnicode_1BYTE_DATA(unicode);
6617 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6618 if (outpos == size)
6619 return unicode;
6620
6621 s += outpos;
6622 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006623 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 register unsigned char c = (unsigned char)*s;
6625 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006626 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 ++s;
6628 }
6629 else {
6630 startinpos = s-starts;
6631 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006632 if (unicode_decode_call_errorhandler(
6633 errors, &errorHandler,
6634 "ascii", "ordinal not in range(128)",
6635 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006636 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006637 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006638 kind = PyUnicode_KIND(unicode);
6639 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006642 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006643 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 Py_XDECREF(errorHandler);
6645 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006646 assert(_PyUnicode_CheckConsistency(unicode, 1));
6647 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006648
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006650 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651 Py_XDECREF(errorHandler);
6652 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 return NULL;
6654}
6655
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006656/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006657PyObject *
6658PyUnicode_EncodeASCII(const Py_UNICODE *p,
6659 Py_ssize_t size,
6660 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006662 PyObject *result;
6663 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6664 if (unicode == NULL)
6665 return NULL;
6666 result = unicode_encode_ucs1(unicode, errors, 128);
6667 Py_DECREF(unicode);
6668 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669}
6670
Alexander Belopolsky40018472011-02-26 01:02:56 +00006671PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006672_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673{
6674 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 PyErr_BadArgument();
6676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006678 if (PyUnicode_READY(unicode) == -1)
6679 return NULL;
6680 /* Fast path: if it is an ASCII-only string, construct bytes object
6681 directly. Else defer to above function to raise the exception. */
6682 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6683 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6684 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006685 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006686}
6687
6688PyObject *
6689PyUnicode_AsASCIIString(PyObject *unicode)
6690{
6691 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692}
6693
Victor Stinner99b95382011-07-04 14:23:54 +02006694#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006695
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006696/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006697
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006698#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006699#define NEED_RETRY
6700#endif
6701
Victor Stinner3a50e702011-10-18 21:21:00 +02006702#ifndef WC_ERR_INVALID_CHARS
6703# define WC_ERR_INVALID_CHARS 0x0080
6704#endif
6705
6706static char*
6707code_page_name(UINT code_page, PyObject **obj)
6708{
6709 *obj = NULL;
6710 if (code_page == CP_ACP)
6711 return "mbcs";
6712 if (code_page == CP_UTF7)
6713 return "CP_UTF7";
6714 if (code_page == CP_UTF8)
6715 return "CP_UTF8";
6716
6717 *obj = PyBytes_FromFormat("cp%u", code_page);
6718 if (*obj == NULL)
6719 return NULL;
6720 return PyBytes_AS_STRING(*obj);
6721}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006722
Alexander Belopolsky40018472011-02-26 01:02:56 +00006723static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006724is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006725{
6726 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006727 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006728
Victor Stinner3a50e702011-10-18 21:21:00 +02006729 if (!IsDBCSLeadByteEx(code_page, *curr))
6730 return 0;
6731
6732 prev = CharPrevExA(code_page, s, curr, 0);
6733 if (prev == curr)
6734 return 1;
6735 /* FIXME: This code is limited to "true" double-byte encodings,
6736 as it assumes an incomplete character consists of a single
6737 byte. */
6738 if (curr - prev == 2)
6739 return 1;
6740 if (!IsDBCSLeadByteEx(code_page, *prev))
6741 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006742 return 0;
6743}
6744
Victor Stinner3a50e702011-10-18 21:21:00 +02006745static DWORD
6746decode_code_page_flags(UINT code_page)
6747{
6748 if (code_page == CP_UTF7) {
6749 /* The CP_UTF7 decoder only supports flags=0 */
6750 return 0;
6751 }
6752 else
6753 return MB_ERR_INVALID_CHARS;
6754}
6755
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006756/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006757 * Decode a byte string from a Windows code page into unicode object in strict
6758 * mode.
6759 *
6760 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6761 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006762 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006763static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006764decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006765 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006766 const char *in,
6767 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006768{
Victor Stinner3a50e702011-10-18 21:21:00 +02006769 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006770 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006771 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006772
6773 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006774 assert(insize > 0);
6775 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6776 if (outsize <= 0)
6777 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006778
6779 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006781 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006782 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 if (*v == NULL)
6784 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006785 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006786 }
6787 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006789 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006790 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006792 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006793 }
6794
6795 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006796 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6797 if (outsize <= 0)
6798 goto error;
6799 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006800
Victor Stinner3a50e702011-10-18 21:21:00 +02006801error:
6802 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6803 return -2;
6804 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006805 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006806}
6807
Victor Stinner3a50e702011-10-18 21:21:00 +02006808/*
6809 * Decode a byte string from a code page into unicode object with an error
6810 * handler.
6811 *
6812 * Returns consumed size if succeed, or raise a WindowsError or
6813 * UnicodeDecodeError exception and returns -1 on error.
6814 */
6815static int
6816decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006817 PyObject **v,
6818 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006819 const char *errors)
6820{
6821 const char *startin = in;
6822 const char *endin = in + size;
6823 const DWORD flags = decode_code_page_flags(code_page);
6824 /* Ideally, we should get reason from FormatMessage. This is the Windows
6825 2000 English version of the message. */
6826 const char *reason = "No mapping for the Unicode character exists "
6827 "in the target code page.";
6828 /* each step cannot decode more than 1 character, but a character can be
6829 represented as a surrogate pair */
6830 wchar_t buffer[2], *startout, *out;
6831 int insize, outsize;
6832 PyObject *errorHandler = NULL;
6833 PyObject *exc = NULL;
6834 PyObject *encoding_obj = NULL;
6835 char *encoding;
6836 DWORD err;
6837 int ret = -1;
6838
6839 assert(size > 0);
6840
6841 encoding = code_page_name(code_page, &encoding_obj);
6842 if (encoding == NULL)
6843 return -1;
6844
6845 if (errors == NULL || strcmp(errors, "strict") == 0) {
6846 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6847 UnicodeDecodeError. */
6848 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6849 if (exc != NULL) {
6850 PyCodec_StrictErrors(exc);
6851 Py_CLEAR(exc);
6852 }
6853 goto error;
6854 }
6855
6856 if (*v == NULL) {
6857 /* Create unicode object */
6858 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6859 PyErr_NoMemory();
6860 goto error;
6861 }
Victor Stinnerab595942011-12-17 04:59:06 +01006862 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006863 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006864 if (*v == NULL)
6865 goto error;
6866 startout = PyUnicode_AS_UNICODE(*v);
6867 }
6868 else {
6869 /* Extend unicode object */
6870 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6871 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6872 PyErr_NoMemory();
6873 goto error;
6874 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006875 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006876 goto error;
6877 startout = PyUnicode_AS_UNICODE(*v) + n;
6878 }
6879
6880 /* Decode the byte string character per character */
6881 out = startout;
6882 while (in < endin)
6883 {
6884 /* Decode a character */
6885 insize = 1;
6886 do
6887 {
6888 outsize = MultiByteToWideChar(code_page, flags,
6889 in, insize,
6890 buffer, Py_ARRAY_LENGTH(buffer));
6891 if (outsize > 0)
6892 break;
6893 err = GetLastError();
6894 if (err != ERROR_NO_UNICODE_TRANSLATION
6895 && err != ERROR_INSUFFICIENT_BUFFER)
6896 {
6897 PyErr_SetFromWindowsErr(0);
6898 goto error;
6899 }
6900 insize++;
6901 }
6902 /* 4=maximum length of a UTF-8 sequence */
6903 while (insize <= 4 && (in + insize) <= endin);
6904
6905 if (outsize <= 0) {
6906 Py_ssize_t startinpos, endinpos, outpos;
6907
6908 startinpos = in - startin;
6909 endinpos = startinpos + 1;
6910 outpos = out - PyUnicode_AS_UNICODE(*v);
6911 if (unicode_decode_call_errorhandler(
6912 errors, &errorHandler,
6913 encoding, reason,
6914 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006915 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006916 {
6917 goto error;
6918 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006919 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006920 }
6921 else {
6922 in += insize;
6923 memcpy(out, buffer, outsize * sizeof(wchar_t));
6924 out += outsize;
6925 }
6926 }
6927
6928 /* write a NUL character at the end */
6929 *out = 0;
6930
6931 /* Extend unicode object */
6932 outsize = out - startout;
6933 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006934 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006935 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006936 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006937
6938error:
6939 Py_XDECREF(encoding_obj);
6940 Py_XDECREF(errorHandler);
6941 Py_XDECREF(exc);
6942 return ret;
6943}
6944
Victor Stinner3a50e702011-10-18 21:21:00 +02006945static PyObject *
6946decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006947 const char *s, Py_ssize_t size,
6948 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006949{
Victor Stinner76a31a62011-11-04 00:05:13 +01006950 PyObject *v = NULL;
6951 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006952
Victor Stinner3a50e702011-10-18 21:21:00 +02006953 if (code_page < 0) {
6954 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6955 return NULL;
6956 }
6957
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006958 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006960
Victor Stinner76a31a62011-11-04 00:05:13 +01006961 do
6962 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006963#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006964 if (size > INT_MAX) {
6965 chunk_size = INT_MAX;
6966 final = 0;
6967 done = 0;
6968 }
6969 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006970#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006971 {
6972 chunk_size = (int)size;
6973 final = (consumed == NULL);
6974 done = 1;
6975 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006976
Victor Stinner76a31a62011-11-04 00:05:13 +01006977 /* Skip trailing lead-byte unless 'final' is set */
6978 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6979 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006980
Victor Stinner76a31a62011-11-04 00:05:13 +01006981 if (chunk_size == 0 && done) {
6982 if (v != NULL)
6983 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006984 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006985 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006986
Victor Stinner76a31a62011-11-04 00:05:13 +01006987
6988 converted = decode_code_page_strict(code_page, &v,
6989 s, chunk_size);
6990 if (converted == -2)
6991 converted = decode_code_page_errors(code_page, &v,
6992 s, chunk_size,
6993 errors);
6994 assert(converted != 0);
6995
6996 if (converted < 0) {
6997 Py_XDECREF(v);
6998 return NULL;
6999 }
7000
7001 if (consumed)
7002 *consumed += converted;
7003
7004 s += converted;
7005 size -= converted;
7006 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007007
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007008 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007009}
7010
Alexander Belopolsky40018472011-02-26 01:02:56 +00007011PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007012PyUnicode_DecodeCodePageStateful(int code_page,
7013 const char *s,
7014 Py_ssize_t size,
7015 const char *errors,
7016 Py_ssize_t *consumed)
7017{
7018 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7019}
7020
7021PyObject *
7022PyUnicode_DecodeMBCSStateful(const char *s,
7023 Py_ssize_t size,
7024 const char *errors,
7025 Py_ssize_t *consumed)
7026{
7027 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7028}
7029
7030PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007031PyUnicode_DecodeMBCS(const char *s,
7032 Py_ssize_t size,
7033 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007034{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007035 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7036}
7037
Victor Stinner3a50e702011-10-18 21:21:00 +02007038static DWORD
7039encode_code_page_flags(UINT code_page, const char *errors)
7040{
7041 if (code_page == CP_UTF8) {
7042 if (winver.dwMajorVersion >= 6)
7043 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7044 and later */
7045 return WC_ERR_INVALID_CHARS;
7046 else
7047 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7048 return 0;
7049 }
7050 else if (code_page == CP_UTF7) {
7051 /* CP_UTF7 only supports flags=0 */
7052 return 0;
7053 }
7054 else {
7055 if (errors != NULL && strcmp(errors, "replace") == 0)
7056 return 0;
7057 else
7058 return WC_NO_BEST_FIT_CHARS;
7059 }
7060}
7061
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007063 * Encode a Unicode string to a Windows code page into a byte string in strict
7064 * mode.
7065 *
7066 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7067 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007068 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007069static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007070encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007071 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007072 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073{
Victor Stinner554f3f02010-06-16 23:33:54 +00007074 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 BOOL *pusedDefaultChar = &usedDefaultChar;
7076 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007077 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007078 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007079 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007080 const DWORD flags = encode_code_page_flags(code_page, NULL);
7081 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007082 /* Create a substring so that we can get the UTF-16 representation
7083 of just the slice under consideration. */
7084 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085
Martin v. Löwis3d325192011-11-04 18:23:06 +01007086 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007087
Victor Stinner3a50e702011-10-18 21:21:00 +02007088 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007089 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007090 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007091 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007092
Victor Stinner2fc507f2011-11-04 20:06:39 +01007093 substring = PyUnicode_Substring(unicode, offset, offset+len);
7094 if (substring == NULL)
7095 return -1;
7096 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7097 if (p == NULL) {
7098 Py_DECREF(substring);
7099 return -1;
7100 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007101
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007102 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007103 outsize = WideCharToMultiByte(code_page, flags,
7104 p, size,
7105 NULL, 0,
7106 NULL, pusedDefaultChar);
7107 if (outsize <= 0)
7108 goto error;
7109 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007110 if (pusedDefaultChar && *pusedDefaultChar) {
7111 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007112 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007113 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007114
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007116 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007118 if (*outbytes == NULL) {
7119 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007121 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007122 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123 }
7124 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 const Py_ssize_t n = PyBytes_Size(*outbytes);
7127 if (outsize > PY_SSIZE_T_MAX - n) {
7128 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007129 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007131 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007132 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7133 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007134 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007135 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007136 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137 }
7138
7139 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 outsize = WideCharToMultiByte(code_page, flags,
7141 p, size,
7142 out, outsize,
7143 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007144 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007145 if (outsize <= 0)
7146 goto error;
7147 if (pusedDefaultChar && *pusedDefaultChar)
7148 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007149 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007150
Victor Stinner3a50e702011-10-18 21:21:00 +02007151error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007152 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007153 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7154 return -2;
7155 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007156 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007157}
7158
Victor Stinner3a50e702011-10-18 21:21:00 +02007159/*
7160 * Encode a Unicode string to a Windows code page into a byte string using a
7161 * error handler.
7162 *
7163 * Returns consumed characters if succeed, or raise a WindowsError and returns
7164 * -1 on other error.
7165 */
7166static int
7167encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007168 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007169 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007170{
Victor Stinner3a50e702011-10-18 21:21:00 +02007171 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007172 Py_ssize_t pos = unicode_offset;
7173 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 /* Ideally, we should get reason from FormatMessage. This is the Windows
7175 2000 English version of the message. */
7176 const char *reason = "invalid character";
7177 /* 4=maximum length of a UTF-8 sequence */
7178 char buffer[4];
7179 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7180 Py_ssize_t outsize;
7181 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 PyObject *errorHandler = NULL;
7183 PyObject *exc = NULL;
7184 PyObject *encoding_obj = NULL;
7185 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007186 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 PyObject *rep;
7188 int ret = -1;
7189
7190 assert(insize > 0);
7191
7192 encoding = code_page_name(code_page, &encoding_obj);
7193 if (encoding == NULL)
7194 return -1;
7195
7196 if (errors == NULL || strcmp(errors, "strict") == 0) {
7197 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7198 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007199 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 if (exc != NULL) {
7201 PyCodec_StrictErrors(exc);
7202 Py_DECREF(exc);
7203 }
7204 Py_XDECREF(encoding_obj);
7205 return -1;
7206 }
7207
7208 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7209 pusedDefaultChar = &usedDefaultChar;
7210 else
7211 pusedDefaultChar = NULL;
7212
7213 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7214 PyErr_NoMemory();
7215 goto error;
7216 }
7217 outsize = insize * Py_ARRAY_LENGTH(buffer);
7218
7219 if (*outbytes == NULL) {
7220 /* Create string object */
7221 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7222 if (*outbytes == NULL)
7223 goto error;
7224 out = PyBytes_AS_STRING(*outbytes);
7225 }
7226 else {
7227 /* Extend string object */
7228 Py_ssize_t n = PyBytes_Size(*outbytes);
7229 if (n > PY_SSIZE_T_MAX - outsize) {
7230 PyErr_NoMemory();
7231 goto error;
7232 }
7233 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7234 goto error;
7235 out = PyBytes_AS_STRING(*outbytes) + n;
7236 }
7237
7238 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007239 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007241 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7242 wchar_t chars[2];
7243 int charsize;
7244 if (ch < 0x10000) {
7245 chars[0] = (wchar_t)ch;
7246 charsize = 1;
7247 }
7248 else {
7249 ch -= 0x10000;
7250 chars[0] = 0xd800 + (ch >> 10);
7251 chars[1] = 0xdc00 + (ch & 0x3ff);
7252 charsize = 2;
7253 }
7254
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007256 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 buffer, Py_ARRAY_LENGTH(buffer),
7258 NULL, pusedDefaultChar);
7259 if (outsize > 0) {
7260 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7261 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007262 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 memcpy(out, buffer, outsize);
7264 out += outsize;
7265 continue;
7266 }
7267 }
7268 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7269 PyErr_SetFromWindowsErr(0);
7270 goto error;
7271 }
7272
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 rep = unicode_encode_call_errorhandler(
7274 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007275 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007276 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007277 if (rep == NULL)
7278 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007279 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007280
7281 if (PyBytes_Check(rep)) {
7282 outsize = PyBytes_GET_SIZE(rep);
7283 if (outsize != 1) {
7284 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7285 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7286 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7287 Py_DECREF(rep);
7288 goto error;
7289 }
7290 out = PyBytes_AS_STRING(*outbytes) + offset;
7291 }
7292 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7293 out += outsize;
7294 }
7295 else {
7296 Py_ssize_t i;
7297 enum PyUnicode_Kind kind;
7298 void *data;
7299
Benjamin Petersonbac79492012-01-14 13:34:47 -05007300 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007301 Py_DECREF(rep);
7302 goto error;
7303 }
7304
7305 outsize = PyUnicode_GET_LENGTH(rep);
7306 if (outsize != 1) {
7307 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7308 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7309 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7310 Py_DECREF(rep);
7311 goto error;
7312 }
7313 out = PyBytes_AS_STRING(*outbytes) + offset;
7314 }
7315 kind = PyUnicode_KIND(rep);
7316 data = PyUnicode_DATA(rep);
7317 for (i=0; i < outsize; i++) {
7318 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7319 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007320 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007321 encoding, unicode,
7322 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007323 "unable to encode error handler result to ASCII");
7324 Py_DECREF(rep);
7325 goto error;
7326 }
7327 *out = (unsigned char)ch;
7328 out++;
7329 }
7330 }
7331 Py_DECREF(rep);
7332 }
7333 /* write a NUL byte */
7334 *out = 0;
7335 outsize = out - PyBytes_AS_STRING(*outbytes);
7336 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7337 if (_PyBytes_Resize(outbytes, outsize) < 0)
7338 goto error;
7339 ret = 0;
7340
7341error:
7342 Py_XDECREF(encoding_obj);
7343 Py_XDECREF(errorHandler);
7344 Py_XDECREF(exc);
7345 return ret;
7346}
7347
Victor Stinner3a50e702011-10-18 21:21:00 +02007348static PyObject *
7349encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007350 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007351 const char *errors)
7352{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007353 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007354 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007355 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007356 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007357
Benjamin Petersonbac79492012-01-14 13:34:47 -05007358 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007359 return NULL;
7360 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007361
Victor Stinner3a50e702011-10-18 21:21:00 +02007362 if (code_page < 0) {
7363 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7364 return NULL;
7365 }
7366
Martin v. Löwis3d325192011-11-04 18:23:06 +01007367 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007368 return PyBytes_FromStringAndSize(NULL, 0);
7369
Victor Stinner7581cef2011-11-03 22:32:33 +01007370 offset = 0;
7371 do
7372 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007373#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007374 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007375 chunks. */
7376 if (len > INT_MAX/2) {
7377 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007378 done = 0;
7379 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007380 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007381#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007382 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007383 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007384 done = 1;
7385 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007386
Victor Stinner76a31a62011-11-04 00:05:13 +01007387 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007388 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007389 errors);
7390 if (ret == -2)
7391 ret = encode_code_page_errors(code_page, &outbytes,
7392 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007393 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007394 if (ret < 0) {
7395 Py_XDECREF(outbytes);
7396 return NULL;
7397 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007398
Victor Stinner7581cef2011-11-03 22:32:33 +01007399 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007400 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007401 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007402
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 return outbytes;
7404}
7405
7406PyObject *
7407PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7408 Py_ssize_t size,
7409 const char *errors)
7410{
Victor Stinner7581cef2011-11-03 22:32:33 +01007411 PyObject *unicode, *res;
7412 unicode = PyUnicode_FromUnicode(p, size);
7413 if (unicode == NULL)
7414 return NULL;
7415 res = encode_code_page(CP_ACP, unicode, errors);
7416 Py_DECREF(unicode);
7417 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007418}
7419
7420PyObject *
7421PyUnicode_EncodeCodePage(int code_page,
7422 PyObject *unicode,
7423 const char *errors)
7424{
Victor Stinner7581cef2011-11-03 22:32:33 +01007425 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007426}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007427
Alexander Belopolsky40018472011-02-26 01:02:56 +00007428PyObject *
7429PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007430{
7431 if (!PyUnicode_Check(unicode)) {
7432 PyErr_BadArgument();
7433 return NULL;
7434 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007435 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007436}
7437
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007438#undef NEED_RETRY
7439
Victor Stinner99b95382011-07-04 14:23:54 +02007440#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007441
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442/* --- Character Mapping Codec -------------------------------------------- */
7443
Alexander Belopolsky40018472011-02-26 01:02:56 +00007444PyObject *
7445PyUnicode_DecodeCharmap(const char *s,
7446 Py_ssize_t size,
7447 PyObject *mapping,
7448 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007450 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007451 Py_ssize_t startinpos;
7452 Py_ssize_t endinpos;
7453 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007454 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007455 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007456 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007457 PyObject *errorHandler = NULL;
7458 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007459
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 /* Default to Latin-1 */
7461 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007464 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007468 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007469 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007470 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007471 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007472 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007473 enum PyUnicode_Kind mapkind;
7474 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007475 Py_UCS4 x;
7476
Benjamin Petersonbac79492012-01-14 13:34:47 -05007477 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007478 return NULL;
7479
7480 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007481 mapdata = PyUnicode_DATA(mapping);
7482 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007484 unsigned char ch;
7485 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7486 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7487 if (outkind == PyUnicode_1BYTE_KIND) {
7488 void *outdata = PyUnicode_DATA(v);
7489 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7490 while (s < e) {
7491 unsigned char ch = *s;
7492 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7493 if (x > maxchar)
7494 goto Error;
7495 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7496 ++s;
7497 }
7498 break;
7499 }
7500 else if (outkind == PyUnicode_2BYTE_KIND) {
7501 void *outdata = PyUnicode_DATA(v);
7502 while (s < e) {
7503 unsigned char ch = *s;
7504 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7505 if (x == 0xFFFE)
7506 goto Error;
7507 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7508 ++s;
7509 }
7510 break;
7511 }
7512 }
7513 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007516 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007517 else
7518 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007519Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007520 if (x == 0xfffe)
7521 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 startinpos = s-starts;
7524 endinpos = startinpos+1;
7525 if (unicode_decode_call_errorhandler(
7526 errors, &errorHandler,
7527 "charmap", "character maps to <undefined>",
7528 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007529 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 goto onError;
7531 }
7532 continue;
7533 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007534
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007535 if (unicode_putchar(&v, &outpos, x) < 0)
7536 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007538 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007539 }
7540 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 while (s < e) {
7542 unsigned char ch = *s;
7543 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007544
Benjamin Peterson29060642009-01-31 22:14:21 +00007545 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7546 w = PyLong_FromLong((long)ch);
7547 if (w == NULL)
7548 goto onError;
7549 x = PyObject_GetItem(mapping, w);
7550 Py_DECREF(w);
7551 if (x == NULL) {
7552 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7553 /* No mapping found means: mapping is undefined. */
7554 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007555 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 } else
7557 goto onError;
7558 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007559
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007561 if (x == Py_None)
7562 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007563 if (PyLong_Check(x)) {
7564 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007565 if (value == 0xFFFE)
7566 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007567 if (value < 0 || value > MAX_UNICODE) {
7568 PyErr_Format(PyExc_TypeError,
7569 "character mapping must be in range(0x%lx)",
7570 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007571 Py_DECREF(x);
7572 goto onError;
7573 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007574 if (unicode_putchar(&v, &outpos, value) < 0) {
7575 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007576 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007577 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007578 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007580 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007581
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007582 if (PyUnicode_READY(x) == -1) {
7583 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007584 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007585 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007586 targetsize = PyUnicode_GET_LENGTH(x);
7587
7588 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 /* 1-1 mapping */
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007590 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007591 if (value == 0xFFFE)
7592 goto Undefined;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007593 if (unicode_putchar(&v, &outpos, value) < 0) {
7594 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007595 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007596 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007597 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 else if (targetsize > 1) {
7599 /* 1-n mapping */
7600 if (targetsize > extrachars) {
7601 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 Py_ssize_t needed = (targetsize - extrachars) + \
7603 (targetsize << 2);
7604 extrachars += needed;
7605 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007606 if (unicode_resize(&v,
7607 PyUnicode_GET_LENGTH(v) + needed) < 0)
7608 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 Py_DECREF(x);
7610 goto onError;
7611 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007613 if (unicode_widen(&v, outpos,
7614 PyUnicode_MAX_CHAR_VALUE(x)) < 0) {
7615 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007616 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007617 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007618 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7619 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007620 extrachars -= targetsize;
7621 }
7622 /* 1-0 mapping: skip the character */
7623 }
7624 else {
7625 /* wrong return value */
7626 PyErr_SetString(PyExc_TypeError,
7627 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007628 Py_DECREF(x);
7629 goto onError;
7630 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 Py_DECREF(x);
7632 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007633 continue;
7634Undefined:
7635 /* undefined mapping */
7636 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007637 startinpos = s-starts;
7638 endinpos = startinpos+1;
7639 if (unicode_decode_call_errorhandler(
7640 errors, &errorHandler,
7641 "charmap", "character maps to <undefined>",
7642 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007643 &v, &outpos)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007644 goto onError;
7645 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007646 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007648 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007649 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007650 Py_XDECREF(errorHandler);
7651 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007652 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007653
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007655 Py_XDECREF(errorHandler);
7656 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657 Py_XDECREF(v);
7658 return NULL;
7659}
7660
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007661/* Charmap encoding: the lookup table */
7662
Alexander Belopolsky40018472011-02-26 01:02:56 +00007663struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 PyObject_HEAD
7665 unsigned char level1[32];
7666 int count2, count3;
7667 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007668};
7669
7670static PyObject*
7671encoding_map_size(PyObject *obj, PyObject* args)
7672{
7673 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007674 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007676}
7677
7678static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007679 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 PyDoc_STR("Return the size (in bytes) of this object") },
7681 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007682};
7683
7684static void
7685encoding_map_dealloc(PyObject* o)
7686{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007687 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007688}
7689
7690static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007691 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 "EncodingMap", /*tp_name*/
7693 sizeof(struct encoding_map), /*tp_basicsize*/
7694 0, /*tp_itemsize*/
7695 /* methods */
7696 encoding_map_dealloc, /*tp_dealloc*/
7697 0, /*tp_print*/
7698 0, /*tp_getattr*/
7699 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007700 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 0, /*tp_repr*/
7702 0, /*tp_as_number*/
7703 0, /*tp_as_sequence*/
7704 0, /*tp_as_mapping*/
7705 0, /*tp_hash*/
7706 0, /*tp_call*/
7707 0, /*tp_str*/
7708 0, /*tp_getattro*/
7709 0, /*tp_setattro*/
7710 0, /*tp_as_buffer*/
7711 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7712 0, /*tp_doc*/
7713 0, /*tp_traverse*/
7714 0, /*tp_clear*/
7715 0, /*tp_richcompare*/
7716 0, /*tp_weaklistoffset*/
7717 0, /*tp_iter*/
7718 0, /*tp_iternext*/
7719 encoding_map_methods, /*tp_methods*/
7720 0, /*tp_members*/
7721 0, /*tp_getset*/
7722 0, /*tp_base*/
7723 0, /*tp_dict*/
7724 0, /*tp_descr_get*/
7725 0, /*tp_descr_set*/
7726 0, /*tp_dictoffset*/
7727 0, /*tp_init*/
7728 0, /*tp_alloc*/
7729 0, /*tp_new*/
7730 0, /*tp_free*/
7731 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007732};
7733
7734PyObject*
7735PyUnicode_BuildEncodingMap(PyObject* string)
7736{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007737 PyObject *result;
7738 struct encoding_map *mresult;
7739 int i;
7740 int need_dict = 0;
7741 unsigned char level1[32];
7742 unsigned char level2[512];
7743 unsigned char *mlevel1, *mlevel2, *mlevel3;
7744 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007745 int kind;
7746 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007747 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007748 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007749
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007750 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007751 PyErr_BadArgument();
7752 return NULL;
7753 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007754 kind = PyUnicode_KIND(string);
7755 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007756 length = PyUnicode_GET_LENGTH(string);
7757 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007758 memset(level1, 0xFF, sizeof level1);
7759 memset(level2, 0xFF, sizeof level2);
7760
7761 /* If there isn't a one-to-one mapping of NULL to \0,
7762 or if there are non-BMP characters, we need to use
7763 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007764 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007765 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007766 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007767 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007768 ch = PyUnicode_READ(kind, data, i);
7769 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007770 need_dict = 1;
7771 break;
7772 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007773 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007774 /* unmapped character */
7775 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007776 l1 = ch >> 11;
7777 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007778 if (level1[l1] == 0xFF)
7779 level1[l1] = count2++;
7780 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007781 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007782 }
7783
7784 if (count2 >= 0xFF || count3 >= 0xFF)
7785 need_dict = 1;
7786
7787 if (need_dict) {
7788 PyObject *result = PyDict_New();
7789 PyObject *key, *value;
7790 if (!result)
7791 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007792 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007793 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007794 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007795 if (!key || !value)
7796 goto failed1;
7797 if (PyDict_SetItem(result, key, value) == -1)
7798 goto failed1;
7799 Py_DECREF(key);
7800 Py_DECREF(value);
7801 }
7802 return result;
7803 failed1:
7804 Py_XDECREF(key);
7805 Py_XDECREF(value);
7806 Py_DECREF(result);
7807 return NULL;
7808 }
7809
7810 /* Create a three-level trie */
7811 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7812 16*count2 + 128*count3 - 1);
7813 if (!result)
7814 return PyErr_NoMemory();
7815 PyObject_Init(result, &EncodingMapType);
7816 mresult = (struct encoding_map*)result;
7817 mresult->count2 = count2;
7818 mresult->count3 = count3;
7819 mlevel1 = mresult->level1;
7820 mlevel2 = mresult->level23;
7821 mlevel3 = mresult->level23 + 16*count2;
7822 memcpy(mlevel1, level1, 32);
7823 memset(mlevel2, 0xFF, 16*count2);
7824 memset(mlevel3, 0, 128*count3);
7825 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007826 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007827 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007828 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7829 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007830 /* unmapped character */
7831 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007832 o1 = ch>>11;
7833 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007834 i2 = 16*mlevel1[o1] + o2;
7835 if (mlevel2[i2] == 0xFF)
7836 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007837 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007838 i3 = 128*mlevel2[i2] + o3;
7839 mlevel3[i3] = i;
7840 }
7841 return result;
7842}
7843
7844static int
Victor Stinner22168992011-11-20 17:09:18 +01007845encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007846{
7847 struct encoding_map *map = (struct encoding_map*)mapping;
7848 int l1 = c>>11;
7849 int l2 = (c>>7) & 0xF;
7850 int l3 = c & 0x7F;
7851 int i;
7852
Victor Stinner22168992011-11-20 17:09:18 +01007853 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007855 if (c == 0)
7856 return 0;
7857 /* level 1*/
7858 i = map->level1[l1];
7859 if (i == 0xFF) {
7860 return -1;
7861 }
7862 /* level 2*/
7863 i = map->level23[16*i+l2];
7864 if (i == 0xFF) {
7865 return -1;
7866 }
7867 /* level 3 */
7868 i = map->level23[16*map->count2 + 128*i + l3];
7869 if (i == 0) {
7870 return -1;
7871 }
7872 return i;
7873}
7874
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007875/* Lookup the character ch in the mapping. If the character
7876 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007877 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007878static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007879charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880{
Christian Heimes217cfd12007-12-02 14:31:20 +00007881 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007882 PyObject *x;
7883
7884 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007886 x = PyObject_GetItem(mapping, w);
7887 Py_DECREF(w);
7888 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7890 /* No mapping found means: mapping is undefined. */
7891 PyErr_Clear();
7892 x = Py_None;
7893 Py_INCREF(x);
7894 return x;
7895 } else
7896 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007898 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007900 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 long value = PyLong_AS_LONG(x);
7902 if (value < 0 || value > 255) {
7903 PyErr_SetString(PyExc_TypeError,
7904 "character mapping must be in range(256)");
7905 Py_DECREF(x);
7906 return NULL;
7907 }
7908 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007910 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 /* wrong return value */
7914 PyErr_Format(PyExc_TypeError,
7915 "character mapping must return integer, bytes or None, not %.400s",
7916 x->ob_type->tp_name);
7917 Py_DECREF(x);
7918 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919 }
7920}
7921
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007922static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007923charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007924{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007925 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7926 /* exponentially overallocate to minimize reallocations */
7927 if (requiredsize < 2*outsize)
7928 requiredsize = 2*outsize;
7929 if (_PyBytes_Resize(outobj, requiredsize))
7930 return -1;
7931 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007932}
7933
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007937/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007938 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007939 space is available. Return a new reference to the object that
7940 was put in the output buffer, or Py_None, if the mapping was undefined
7941 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007942 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007943static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007944charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007945 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007946{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007947 PyObject *rep;
7948 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007949 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007950
Christian Heimes90aa7642007-12-19 02:45:37 +00007951 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007952 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007954 if (res == -1)
7955 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 if (outsize<requiredsize)
7957 if (charmapencode_resize(outobj, outpos, requiredsize))
7958 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007959 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 outstart[(*outpos)++] = (char)res;
7961 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007962 }
7963
7964 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007965 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007967 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 Py_DECREF(rep);
7969 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007970 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 if (PyLong_Check(rep)) {
7972 Py_ssize_t requiredsize = *outpos+1;
7973 if (outsize<requiredsize)
7974 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7975 Py_DECREF(rep);
7976 return enc_EXCEPTION;
7977 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007978 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007980 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 else {
7982 const char *repchars = PyBytes_AS_STRING(rep);
7983 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7984 Py_ssize_t requiredsize = *outpos+repsize;
7985 if (outsize<requiredsize)
7986 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7987 Py_DECREF(rep);
7988 return enc_EXCEPTION;
7989 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007990 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 memcpy(outstart + *outpos, repchars, repsize);
7992 *outpos += repsize;
7993 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007994 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007995 Py_DECREF(rep);
7996 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007997}
7998
7999/* handle an error in PyUnicode_EncodeCharmap
8000 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008001static int
8002charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008003 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008004 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008005 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008006 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008007{
8008 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008009 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008010 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008011 enum PyUnicode_Kind kind;
8012 void *data;
8013 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008014 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008015 Py_ssize_t collstartpos = *inpos;
8016 Py_ssize_t collendpos = *inpos+1;
8017 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008018 char *encoding = "charmap";
8019 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008020 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008021 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008022 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008023
Benjamin Petersonbac79492012-01-14 13:34:47 -05008024 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008025 return -1;
8026 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008027 /* find all unencodable characters */
8028 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008029 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008030 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008031 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008032 val = encoding_map_lookup(ch, mapping);
8033 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 break;
8035 ++collendpos;
8036 continue;
8037 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008039 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8040 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 if (rep==NULL)
8042 return -1;
8043 else if (rep!=Py_None) {
8044 Py_DECREF(rep);
8045 break;
8046 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008047 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008049 }
8050 /* cache callback name lookup
8051 * (if not done yet, i.e. it's the first error) */
8052 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 if ((errors==NULL) || (!strcmp(errors, "strict")))
8054 *known_errorHandler = 1;
8055 else if (!strcmp(errors, "replace"))
8056 *known_errorHandler = 2;
8057 else if (!strcmp(errors, "ignore"))
8058 *known_errorHandler = 3;
8059 else if (!strcmp(errors, "xmlcharrefreplace"))
8060 *known_errorHandler = 4;
8061 else
8062 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008063 }
8064 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008065 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008066 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008067 return -1;
8068 case 2: /* replace */
8069 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 x = charmapencode_output('?', mapping, res, respos);
8071 if (x==enc_EXCEPTION) {
8072 return -1;
8073 }
8074 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008075 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 return -1;
8077 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008078 }
8079 /* fall through */
8080 case 3: /* ignore */
8081 *inpos = collendpos;
8082 break;
8083 case 4: /* xmlcharrefreplace */
8084 /* generate replacement (temporarily (mis)uses p) */
8085 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 char buffer[2+29+1+1];
8087 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008088 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 for (cp = buffer; *cp; ++cp) {
8090 x = charmapencode_output(*cp, mapping, res, respos);
8091 if (x==enc_EXCEPTION)
8092 return -1;
8093 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008094 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 return -1;
8096 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008097 }
8098 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008099 *inpos = collendpos;
8100 break;
8101 default:
8102 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008103 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008105 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008107 if (PyBytes_Check(repunicode)) {
8108 /* Directly copy bytes result to output. */
8109 Py_ssize_t outsize = PyBytes_Size(*res);
8110 Py_ssize_t requiredsize;
8111 repsize = PyBytes_Size(repunicode);
8112 requiredsize = *respos + repsize;
8113 if (requiredsize > outsize)
8114 /* Make room for all additional bytes. */
8115 if (charmapencode_resize(res, respos, requiredsize)) {
8116 Py_DECREF(repunicode);
8117 return -1;
8118 }
8119 memcpy(PyBytes_AsString(*res) + *respos,
8120 PyBytes_AsString(repunicode), repsize);
8121 *respos += repsize;
8122 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008123 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008124 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008125 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008126 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008127 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008128 Py_DECREF(repunicode);
8129 return -1;
8130 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008131 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008132 data = PyUnicode_DATA(repunicode);
8133 kind = PyUnicode_KIND(repunicode);
8134 for (index = 0; index < repsize; index++) {
8135 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8136 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008138 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 return -1;
8140 }
8141 else if (x==enc_FAILED) {
8142 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008143 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 return -1;
8145 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008146 }
8147 *inpos = newpos;
8148 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008149 }
8150 return 0;
8151}
8152
Alexander Belopolsky40018472011-02-26 01:02:56 +00008153PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008154_PyUnicode_EncodeCharmap(PyObject *unicode,
8155 PyObject *mapping,
8156 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008158 /* output object */
8159 PyObject *res = NULL;
8160 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008161 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008162 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008163 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008164 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008165 PyObject *errorHandler = NULL;
8166 PyObject *exc = NULL;
8167 /* the following variable is used for caching string comparisons
8168 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8169 * 3=ignore, 4=xmlcharrefreplace */
8170 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171
Benjamin Petersonbac79492012-01-14 13:34:47 -05008172 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008173 return NULL;
8174 size = PyUnicode_GET_LENGTH(unicode);
8175
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176 /* Default to Latin-1 */
8177 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008178 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008180 /* allocate enough for a simple encoding without
8181 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008182 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008183 if (res == NULL)
8184 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008185 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008188 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008189 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008191 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 if (x==enc_EXCEPTION) /* error */
8193 goto onError;
8194 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008195 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 &exc,
8197 &known_errorHandler, &errorHandler, errors,
8198 &res, &respos)) {
8199 goto onError;
8200 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008201 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 else
8203 /* done with this character => adjust input position */
8204 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008207 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008208 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008209 if (_PyBytes_Resize(&res, respos) < 0)
8210 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008211
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008212 Py_XDECREF(exc);
8213 Py_XDECREF(errorHandler);
8214 return res;
8215
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008217 Py_XDECREF(res);
8218 Py_XDECREF(exc);
8219 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220 return NULL;
8221}
8222
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008223/* Deprecated */
8224PyObject *
8225PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8226 Py_ssize_t size,
8227 PyObject *mapping,
8228 const char *errors)
8229{
8230 PyObject *result;
8231 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8232 if (unicode == NULL)
8233 return NULL;
8234 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8235 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008236 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008237}
8238
Alexander Belopolsky40018472011-02-26 01:02:56 +00008239PyObject *
8240PyUnicode_AsCharmapString(PyObject *unicode,
8241 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242{
8243 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 PyErr_BadArgument();
8245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008247 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248}
8249
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008250/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008251static void
8252make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008253 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008254 Py_ssize_t startpos, Py_ssize_t endpos,
8255 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 *exceptionObject = _PyUnicodeTranslateError_Create(
8259 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 }
8261 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8263 goto onError;
8264 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8265 goto onError;
8266 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8267 goto onError;
8268 return;
8269 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008270 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 }
8272}
8273
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008275static void
8276raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008277 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008278 Py_ssize_t startpos, Py_ssize_t endpos,
8279 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280{
8281 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008282 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008283 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008285}
8286
8287/* error handling callback helper:
8288 build arguments, call the callback and check the arguments,
8289 put the result into newpos and return the replacement string, which
8290 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008291static PyObject *
8292unicode_translate_call_errorhandler(const char *errors,
8293 PyObject **errorHandler,
8294 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008296 Py_ssize_t startpos, Py_ssize_t endpos,
8297 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008299 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008301 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 PyObject *restuple;
8303 PyObject *resunicode;
8304
8305 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008307 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309 }
8310
8311 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008313 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008315
8316 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008318 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008321 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 Py_DECREF(restuple);
8323 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 }
8325 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 &resunicode, &i_newpos)) {
8327 Py_DECREF(restuple);
8328 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008330 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008331 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008332 else
8333 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8336 Py_DECREF(restuple);
8337 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008338 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008339 Py_INCREF(resunicode);
8340 Py_DECREF(restuple);
8341 return resunicode;
8342}
8343
8344/* Lookup the character ch in the mapping and put the result in result,
8345 which must be decrefed by the caller.
8346 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008347static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008349{
Christian Heimes217cfd12007-12-02 14:31:20 +00008350 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008351 PyObject *x;
8352
8353 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355 x = PyObject_GetItem(mapping, w);
8356 Py_DECREF(w);
8357 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8359 /* No mapping found means: use 1:1 mapping. */
8360 PyErr_Clear();
8361 *result = NULL;
8362 return 0;
8363 } else
8364 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365 }
8366 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 *result = x;
8368 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008370 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 long value = PyLong_AS_LONG(x);
8372 long max = PyUnicode_GetMax();
8373 if (value < 0 || value > max) {
8374 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008375 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 Py_DECREF(x);
8377 return -1;
8378 }
8379 *result = x;
8380 return 0;
8381 }
8382 else if (PyUnicode_Check(x)) {
8383 *result = x;
8384 return 0;
8385 }
8386 else {
8387 /* wrong return value */
8388 PyErr_SetString(PyExc_TypeError,
8389 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008390 Py_DECREF(x);
8391 return -1;
8392 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393}
8394/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 if not reallocate and adjust various state variables.
8396 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008397static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008402 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008403 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 /* exponentially overallocate to minimize reallocations */
8405 if (requiredsize < 2 * oldsize)
8406 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008407 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8408 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008410 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 }
8413 return 0;
8414}
8415/* lookup the character, put the result in the output string and adjust
8416 various state variables. Return a new reference to the object that
8417 was put in the output buffer in *result, or Py_None, if the mapping was
8418 undefined (in which case no character was written).
8419 The called must decref result.
8420 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008421static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8423 PyObject *mapping, Py_UCS4 **output,
8424 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008425 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8428 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433 }
8434 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008436 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008439 }
8440 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008441 Py_ssize_t repsize;
8442 if (PyUnicode_READY(*res) == -1)
8443 return -1;
8444 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 if (repsize==1) {
8446 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 }
8449 else if (repsize!=0) {
8450 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451 Py_ssize_t requiredsize = *opos +
8452 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 Py_ssize_t i;
8455 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 for(i = 0; i < repsize; i++)
8458 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008460 }
8461 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008463 return 0;
8464}
8465
Alexander Belopolsky40018472011-02-26 01:02:56 +00008466PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467_PyUnicode_TranslateCharmap(PyObject *input,
8468 PyObject *mapping,
8469 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 /* input object */
8472 char *idata;
8473 Py_ssize_t size, i;
8474 int kind;
8475 /* output buffer */
8476 Py_UCS4 *output = NULL;
8477 Py_ssize_t osize;
8478 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008479 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008481 char *reason = "character maps to <undefined>";
8482 PyObject *errorHandler = NULL;
8483 PyObject *exc = NULL;
8484 /* the following variable is used for caching string comparisons
8485 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8486 * 3=ignore, 4=xmlcharrefreplace */
8487 int known_errorHandler = -1;
8488
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 PyErr_BadArgument();
8491 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008494 if (PyUnicode_READY(input) == -1)
8495 return NULL;
8496 idata = (char*)PyUnicode_DATA(input);
8497 kind = PyUnicode_KIND(input);
8498 size = PyUnicode_GET_LENGTH(input);
8499 i = 0;
8500
8501 if (size == 0) {
8502 Py_INCREF(input);
8503 return input;
8504 }
8505
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506 /* allocate enough for a simple 1:1 translation without
8507 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 osize = size;
Benjamin Petersone5a853c2015-03-02 13:23:25 -05008509 output = PyMem_NEW(Py_UCS4, osize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510 opos = 0;
8511 if (output == NULL) {
8512 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 /* try to encode it */
8518 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 if (charmaptranslate_output(input, i, mapping,
8520 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 Py_XDECREF(x);
8522 goto onError;
8523 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008524 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 else { /* untranslatable character */
8528 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8529 Py_ssize_t repsize;
8530 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533 Py_ssize_t collstart = i;
8534 Py_ssize_t collend = i+1;
8535 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 while (collend < size) {
8539 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 goto onError;
8541 Py_XDECREF(x);
8542 if (x!=Py_None)
8543 break;
8544 ++collend;
8545 }
8546 /* cache callback name lookup
8547 * (if not done yet, i.e. it's the first error) */
8548 if (known_errorHandler==-1) {
8549 if ((errors==NULL) || (!strcmp(errors, "strict")))
8550 known_errorHandler = 1;
8551 else if (!strcmp(errors, "replace"))
8552 known_errorHandler = 2;
8553 else if (!strcmp(errors, "ignore"))
8554 known_errorHandler = 3;
8555 else if (!strcmp(errors, "xmlcharrefreplace"))
8556 known_errorHandler = 4;
8557 else
8558 known_errorHandler = 0;
8559 }
8560 switch (known_errorHandler) {
8561 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 raise_translate_exception(&exc, input, collstart,
8563 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008564 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 case 2: /* replace */
8566 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 for (coll = collstart; coll<collend; coll++)
8568 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 /* fall through */
8570 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 break;
8573 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008574 /* generate replacement (temporarily (mis)uses i) */
8575 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 char buffer[2+29+1+1];
8577 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8579 if (charmaptranslate_makespace(&output, &osize,
8580 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 goto onError;
8582 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008585 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 break;
8587 default:
8588 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 reason, input, &exc,
8590 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008591 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008593 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008594 Py_DECREF(repunicode);
8595 goto onError;
8596 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 repsize = PyUnicode_GET_LENGTH(repunicode);
8599 if (charmaptranslate_makespace(&output, &osize,
8600 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 Py_DECREF(repunicode);
8602 goto onError;
8603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604 for (uni2 = 0; repsize-->0; ++uni2)
8605 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8606 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008608 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008609 }
8610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8612 if (!res)
8613 goto onError;
8614 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008615 Py_XDECREF(exc);
8616 Py_XDECREF(errorHandler);
8617 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008621 Py_XDECREF(exc);
8622 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623 return NULL;
8624}
8625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626/* Deprecated. Use PyUnicode_Translate instead. */
8627PyObject *
8628PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8629 Py_ssize_t size,
8630 PyObject *mapping,
8631 const char *errors)
8632{
Christian Heimes5f520f42012-09-11 14:03:25 +02008633 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8635 if (!unicode)
8636 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008637 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8638 Py_DECREF(unicode);
8639 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640}
8641
Alexander Belopolsky40018472011-02-26 01:02:56 +00008642PyObject *
8643PyUnicode_Translate(PyObject *str,
8644 PyObject *mapping,
8645 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646{
8647 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008648
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649 str = PyUnicode_FromObject(str);
8650 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008651 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653 Py_DECREF(str);
8654 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655}
Tim Petersced69f82003-09-16 20:30:58 +00008656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008658fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659{
8660 /* No need to call PyUnicode_READY(self) because this function is only
8661 called as a callback from fixup() which does it already. */
8662 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8663 const int kind = PyUnicode_KIND(self);
8664 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008665 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008666 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 Py_ssize_t i;
8668
8669 for (i = 0; i < len; ++i) {
8670 ch = PyUnicode_READ(kind, data, i);
8671 fixed = 0;
8672 if (ch > 127) {
8673 if (Py_UNICODE_ISSPACE(ch))
8674 fixed = ' ';
8675 else {
8676 const int decimal = Py_UNICODE_TODECIMAL(ch);
8677 if (decimal >= 0)
8678 fixed = '0' + decimal;
8679 }
8680 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008681 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008682 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008683 PyUnicode_WRITE(kind, data, i, fixed);
8684 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008685 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008686 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 }
8689
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008690 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008691}
8692
8693PyObject *
8694_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8695{
8696 if (!PyUnicode_Check(unicode)) {
8697 PyErr_BadInternalCall();
8698 return NULL;
8699 }
8700 if (PyUnicode_READY(unicode) == -1)
8701 return NULL;
8702 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8703 /* If the string is already ASCII, just return the same string */
8704 Py_INCREF(unicode);
8705 return unicode;
8706 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008707 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708}
8709
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008710PyObject *
8711PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8712 Py_ssize_t length)
8713{
Victor Stinnerf0124502011-11-21 23:12:56 +01008714 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008715 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008716 Py_UCS4 maxchar;
8717 enum PyUnicode_Kind kind;
8718 void *data;
8719
Victor Stinner99d7ad02012-02-22 13:37:39 +01008720 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008721 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008722 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008723 if (ch > 127) {
8724 int decimal = Py_UNICODE_TODECIMAL(ch);
8725 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008726 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008727 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008728 }
8729 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008730
8731 /* Copy to a new string */
8732 decimal = PyUnicode_New(length, maxchar);
8733 if (decimal == NULL)
8734 return decimal;
8735 kind = PyUnicode_KIND(decimal);
8736 data = PyUnicode_DATA(decimal);
8737 /* Iterate over code points */
8738 for (i = 0; i < length; i++) {
8739 Py_UNICODE ch = s[i];
8740 if (ch > 127) {
8741 int decimal = Py_UNICODE_TODECIMAL(ch);
8742 if (decimal >= 0)
8743 ch = '0' + decimal;
8744 }
8745 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008747 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008748}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008749/* --- Decimal Encoder ---------------------------------------------------- */
8750
Alexander Belopolsky40018472011-02-26 01:02:56 +00008751int
8752PyUnicode_EncodeDecimal(Py_UNICODE *s,
8753 Py_ssize_t length,
8754 char *output,
8755 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008756{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008757 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008758 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008759 enum PyUnicode_Kind kind;
8760 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008761
8762 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 PyErr_BadArgument();
8764 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008765 }
8766
Victor Stinner42bf7752011-11-21 22:52:58 +01008767 unicode = PyUnicode_FromUnicode(s, length);
8768 if (unicode == NULL)
8769 return -1;
8770
Benjamin Petersonbac79492012-01-14 13:34:47 -05008771 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008772 Py_DECREF(unicode);
8773 return -1;
8774 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008775 kind = PyUnicode_KIND(unicode);
8776 data = PyUnicode_DATA(unicode);
8777
Victor Stinnerb84d7232011-11-22 01:50:07 +01008778 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008779 PyObject *exc;
8780 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008782 Py_ssize_t startpos;
8783
8784 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008785
Benjamin Peterson29060642009-01-31 22:14:21 +00008786 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008787 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008788 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008790 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 decimal = Py_UNICODE_TODECIMAL(ch);
8792 if (decimal >= 0) {
8793 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008794 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 continue;
8796 }
8797 if (0 < ch && ch < 256) {
8798 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008799 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 continue;
8801 }
Victor Stinner6345be92011-11-25 20:09:01 +01008802
Victor Stinner42bf7752011-11-21 22:52:58 +01008803 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008804 exc = NULL;
8805 raise_encode_exception(&exc, "decimal", unicode,
8806 startpos, startpos+1,
8807 "invalid decimal Unicode string");
8808 Py_XDECREF(exc);
8809 Py_DECREF(unicode);
8810 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008811 }
8812 /* 0-terminate the output string */
8813 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008814 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008815 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008816}
8817
Guido van Rossumd57fd912000-03-10 22:53:23 +00008818/* --- Helpers ------------------------------------------------------------ */
8819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008821any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 Py_ssize_t start,
8823 Py_ssize_t end)
8824{
8825 int kind1, kind2, kind;
8826 void *buf1, *buf2;
8827 Py_ssize_t len1, len2, result;
8828
8829 kind1 = PyUnicode_KIND(s1);
8830 kind2 = PyUnicode_KIND(s2);
8831 kind = kind1 > kind2 ? kind1 : kind2;
8832 buf1 = PyUnicode_DATA(s1);
8833 buf2 = PyUnicode_DATA(s2);
8834 if (kind1 != kind)
8835 buf1 = _PyUnicode_AsKind(s1, kind);
8836 if (!buf1)
8837 return -2;
8838 if (kind2 != kind)
8839 buf2 = _PyUnicode_AsKind(s2, kind);
8840 if (!buf2) {
8841 if (kind1 != kind) PyMem_Free(buf1);
8842 return -2;
8843 }
8844 len1 = PyUnicode_GET_LENGTH(s1);
8845 len2 = PyUnicode_GET_LENGTH(s2);
8846
Victor Stinner794d5672011-10-10 03:21:36 +02008847 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008848 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008849 case PyUnicode_1BYTE_KIND:
8850 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8851 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8852 else
8853 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8854 break;
8855 case PyUnicode_2BYTE_KIND:
8856 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8857 break;
8858 case PyUnicode_4BYTE_KIND:
8859 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8860 break;
8861 default:
8862 assert(0); result = -2;
8863 }
8864 }
8865 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008866 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008867 case PyUnicode_1BYTE_KIND:
8868 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8869 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8870 else
8871 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8872 break;
8873 case PyUnicode_2BYTE_KIND:
8874 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8875 break;
8876 case PyUnicode_4BYTE_KIND:
8877 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8878 break;
8879 default:
8880 assert(0); result = -2;
8881 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 }
8883
8884 if (kind1 != kind)
8885 PyMem_Free(buf1);
8886 if (kind2 != kind)
8887 PyMem_Free(buf2);
8888
8889 return result;
8890}
8891
8892Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008893_PyUnicode_InsertThousandsGrouping(
8894 PyObject *unicode, Py_ssize_t index,
8895 Py_ssize_t n_buffer,
8896 void *digits, Py_ssize_t n_digits,
8897 Py_ssize_t min_width,
8898 const char *grouping, PyObject *thousands_sep,
8899 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900{
Victor Stinner41a863c2012-02-24 00:37:51 +01008901 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008902 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008903 Py_ssize_t thousands_sep_len;
8904 Py_ssize_t len;
8905
8906 if (unicode != NULL) {
8907 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008908 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008909 }
8910 else {
8911 kind = PyUnicode_1BYTE_KIND;
8912 data = NULL;
8913 }
8914 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8915 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8916 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8917 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008918 if (thousands_sep_kind < kind) {
8919 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8920 if (!thousands_sep_data)
8921 return -1;
8922 }
8923 else {
8924 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8925 if (!data)
8926 return -1;
8927 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008928 }
8929
Benjamin Petersonead6b532011-12-20 17:23:42 -06008930 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008932 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008933 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008934 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008935 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008936 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008937 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008938 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008939 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008940 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008941 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008942 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008944 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008945 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008946 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008947 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008948 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008950 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008951 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008952 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008953 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008954 break;
8955 default:
8956 assert(0);
8957 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008959 if (unicode != NULL && thousands_sep_kind != kind) {
8960 if (thousands_sep_kind < kind)
8961 PyMem_Free(thousands_sep_data);
8962 else
8963 PyMem_Free(data);
8964 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008965 if (unicode == NULL) {
8966 *maxchar = 127;
8967 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07008968 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02008969 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008970 }
8971 }
8972 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973}
8974
8975
/* helper macro to fixup start/end slice values: `end` is clipped to `len`,
   negative `start`/`end` are taken relative to `len` and floored at 0.
   `start` and `end` must be modifiable lvalues; all three arguments are
   evaluated more than once.  Wrapped in do { } while (0) so that it acts
   as a single statement (safe as the body of an unbraced if/else); use it
   followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008990
Alexander Belopolsky40018472011-02-26 01:02:56 +00008991Py_ssize_t
8992PyUnicode_Count(PyObject *str,
8993 PyObject *substr,
8994 Py_ssize_t start,
8995 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008997 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008998 PyObject* str_obj;
8999 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 int kind1, kind2, kind;
9001 void *buf1 = NULL, *buf2 = NULL;
9002 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009003
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009004 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009005 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009007 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009008 if (!sub_obj) {
9009 Py_DECREF(str_obj);
9010 return -1;
9011 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009012 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009013 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 Py_DECREF(str_obj);
9015 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016 }
Tim Petersced69f82003-09-16 20:30:58 +00009017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 kind1 = PyUnicode_KIND(str_obj);
9019 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009020 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009023 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009024 if (kind2 > kind) {
9025 Py_DECREF(sub_obj);
9026 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009027 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009028 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009029 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 if (!buf2)
9032 goto onError;
9033 len1 = PyUnicode_GET_LENGTH(str_obj);
9034 len2 = PyUnicode_GET_LENGTH(sub_obj);
9035
9036 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009037 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009039 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9040 result = asciilib_count(
9041 ((Py_UCS1*)buf1) + start, end - start,
9042 buf2, len2, PY_SSIZE_T_MAX
9043 );
9044 else
9045 result = ucs1lib_count(
9046 ((Py_UCS1*)buf1) + start, end - start,
9047 buf2, len2, PY_SSIZE_T_MAX
9048 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 break;
9050 case PyUnicode_2BYTE_KIND:
9051 result = ucs2lib_count(
9052 ((Py_UCS2*)buf1) + start, end - start,
9053 buf2, len2, PY_SSIZE_T_MAX
9054 );
9055 break;
9056 case PyUnicode_4BYTE_KIND:
9057 result = ucs4lib_count(
9058 ((Py_UCS4*)buf1) + start, end - start,
9059 buf2, len2, PY_SSIZE_T_MAX
9060 );
9061 break;
9062 default:
9063 assert(0); result = 0;
9064 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009065
9066 Py_DECREF(sub_obj);
9067 Py_DECREF(str_obj);
9068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 if (kind2 != kind)
9070 PyMem_Free(buf2);
9071
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 onError:
9074 Py_DECREF(sub_obj);
9075 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 if (kind2 != kind && buf2)
9077 PyMem_Free(buf2);
9078 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079}
9080
Alexander Belopolsky40018472011-02-26 01:02:56 +00009081Py_ssize_t
9082PyUnicode_Find(PyObject *str,
9083 PyObject *sub,
9084 Py_ssize_t start,
9085 Py_ssize_t end,
9086 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009088 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009089
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009091 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009093 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009094 if (!sub) {
9095 Py_DECREF(str);
9096 return -2;
9097 }
9098 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9099 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009100 Py_DECREF(str);
9101 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102 }
Tim Petersced69f82003-09-16 20:30:58 +00009103
Victor Stinner794d5672011-10-10 03:21:36 +02009104 result = any_find_slice(direction,
9105 str, sub, start, end
9106 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009107
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009109 Py_DECREF(sub);
9110
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111 return result;
9112}
9113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114Py_ssize_t
9115PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9116 Py_ssize_t start, Py_ssize_t end,
9117 int direction)
9118{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009120 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009121 if (PyUnicode_READY(str) == -1)
9122 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009123 if (start < 0 || end < 0) {
9124 PyErr_SetString(PyExc_IndexError, "string index out of range");
9125 return -2;
9126 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 if (end > PyUnicode_GET_LENGTH(str))
9128 end = PyUnicode_GET_LENGTH(str);
9129 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009130 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9131 kind, end-start, ch, direction);
9132 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009134 else
9135 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136}
9137
Alexander Belopolsky40018472011-02-26 01:02:56 +00009138static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009139tailmatch(PyObject *self,
9140 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009141 Py_ssize_t start,
9142 Py_ssize_t end,
9143 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145 int kind_self;
9146 int kind_sub;
9147 void *data_self;
9148 void *data_sub;
9149 Py_ssize_t offset;
9150 Py_ssize_t i;
9151 Py_ssize_t end_sub;
9152
9153 if (PyUnicode_READY(self) == -1 ||
9154 PyUnicode_READY(substring) == -1)
9155 return 0;
9156
9157 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158 return 1;
9159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009160 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9161 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009165 kind_self = PyUnicode_KIND(self);
9166 data_self = PyUnicode_DATA(self);
9167 kind_sub = PyUnicode_KIND(substring);
9168 data_sub = PyUnicode_DATA(substring);
9169 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9170
9171 if (direction > 0)
9172 offset = end;
9173 else
9174 offset = start;
9175
9176 if (PyUnicode_READ(kind_self, data_self, offset) ==
9177 PyUnicode_READ(kind_sub, data_sub, 0) &&
9178 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9179 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9180 /* If both are of the same kind, memcmp is sufficient */
9181 if (kind_self == kind_sub) {
9182 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009183 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184 data_sub,
9185 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009186 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 }
9188 /* otherwise we have to compare each character by first accesing it */
9189 else {
9190 /* We do not need to compare 0 and len(substring)-1 because
9191 the if statement above ensured already that they are equal
9192 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02009193 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 for (i = 1; i < end_sub; ++i) {
9195 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9196 PyUnicode_READ(kind_sub, data_sub, i))
9197 return 0;
9198 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009201 }
9202
9203 return 0;
9204}
9205
Alexander Belopolsky40018472011-02-26 01:02:56 +00009206Py_ssize_t
9207PyUnicode_Tailmatch(PyObject *str,
9208 PyObject *substr,
9209 Py_ssize_t start,
9210 Py_ssize_t end,
9211 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009212{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009213 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009214
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215 str = PyUnicode_FromObject(str);
9216 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009217 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009218 substr = PyUnicode_FromObject(substr);
9219 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009220 Py_DECREF(str);
9221 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222 }
Tim Petersced69f82003-09-16 20:30:58 +00009223
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009224 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009225 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226 Py_DECREF(str);
9227 Py_DECREF(substr);
9228 return result;
9229}
9230
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231/* Apply fixfct filter to the Unicode object self and return a
9232 reference to the modified object */
9233
Alexander Belopolsky40018472011-02-26 01:02:56 +00009234static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009235fixup(PyObject *self,
9236 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238 PyObject *u;
9239 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009240 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009242 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009243 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009244 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009245 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 /* fix functions return the new maximum character in a string,
9248 if the kind of the resulting unicode object does not change,
9249 everything is fine. Otherwise we need to change the string kind
9250 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009251 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009252
9253 if (maxchar_new == 0) {
9254 /* no changes */;
9255 if (PyUnicode_CheckExact(self)) {
9256 Py_DECREF(u);
9257 Py_INCREF(self);
9258 return self;
9259 }
9260 else
9261 return u;
9262 }
9263
Victor Stinnere6abb482012-05-02 01:15:40 +02009264 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265
Victor Stinnereaab6042011-12-11 22:22:39 +01009266 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009268
9269 /* In case the maximum character changed, we need to
9270 convert the string to the new category. */
9271 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9272 if (v == NULL) {
9273 Py_DECREF(u);
9274 return NULL;
9275 }
9276 if (maxchar_new > maxchar_old) {
9277 /* If the maxchar increased so that the kind changed, not all
9278 characters are representable anymore and we need to fix the
9279 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009280 _PyUnicode_FastCopyCharacters(v, 0,
9281 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009282 maxchar_old = fixfct(v);
9283 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009284 }
9285 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009286 _PyUnicode_FastCopyCharacters(v, 0,
9287 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009289 Py_DECREF(u);
9290 assert(_PyUnicode_CheckConsistency(v, 1));
9291 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292}
9293
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009294static PyObject *
9295ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009297 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9298 char *resdata, *data = PyUnicode_DATA(self);
9299 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009300
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009301 res = PyUnicode_New(len, 127);
9302 if (res == NULL)
9303 return NULL;
9304 resdata = PyUnicode_DATA(res);
9305 if (lower)
9306 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009308 _Py_bytes_upper(resdata, data, len);
9309 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310}
9311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009313handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009314{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009315 Py_ssize_t j;
9316 int final_sigma;
9317 Py_UCS4 c;
9318 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009319
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009320 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9321
9322 where ! is a negation and \p{xxx} is a character with property xxx.
9323 */
9324 for (j = i - 1; j >= 0; j--) {
9325 c = PyUnicode_READ(kind, data, j);
9326 if (!_PyUnicode_IsCaseIgnorable(c))
9327 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009328 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009329 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9330 if (final_sigma) {
9331 for (j = i + 1; j < length; j++) {
9332 c = PyUnicode_READ(kind, data, j);
9333 if (!_PyUnicode_IsCaseIgnorable(c))
9334 break;
9335 }
9336 final_sigma = j == length || !_PyUnicode_IsCased(c);
9337 }
9338 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339}
9340
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009341static int
9342lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9343 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009345 /* Obscure special case. */
9346 if (c == 0x3A3) {
9347 mapped[0] = handle_capital_sigma(kind, data, length, i);
9348 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009350 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009351}
9352
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009353static Py_ssize_t
9354do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009356 Py_ssize_t i, k = 0;
9357 int n_res, j;
9358 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009359
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009360 c = PyUnicode_READ(kind, data, 0);
9361 n_res = _PyUnicode_ToUpperFull(c, mapped);
9362 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009363 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009364 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009366 for (i = 1; i < length; i++) {
9367 c = PyUnicode_READ(kind, data, i);
9368 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9369 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009370 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009371 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009372 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009373 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009374 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375}
9376
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009377static Py_ssize_t
9378do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9379 Py_ssize_t i, k = 0;
9380
9381 for (i = 0; i < length; i++) {
9382 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9383 int n_res, j;
9384 if (Py_UNICODE_ISUPPER(c)) {
9385 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9386 }
9387 else if (Py_UNICODE_ISLOWER(c)) {
9388 n_res = _PyUnicode_ToUpperFull(c, mapped);
9389 }
9390 else {
9391 n_res = 1;
9392 mapped[0] = c;
9393 }
9394 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009395 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009396 res[k++] = mapped[j];
9397 }
9398 }
9399 return k;
9400}
9401
9402static Py_ssize_t
9403do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9404 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009406 Py_ssize_t i, k = 0;
9407
9408 for (i = 0; i < length; i++) {
9409 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9410 int n_res, j;
9411 if (lower)
9412 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9413 else
9414 n_res = _PyUnicode_ToUpperFull(c, mapped);
9415 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009416 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009417 res[k++] = mapped[j];
9418 }
9419 }
9420 return k;
9421}
9422
9423static Py_ssize_t
9424do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9425{
9426 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9427}
9428
9429static Py_ssize_t
9430do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9431{
9432 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9433}
9434
Benjamin Petersone51757f2012-01-12 21:10:29 -05009435static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009436do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9437{
9438 Py_ssize_t i, k = 0;
9439
9440 for (i = 0; i < length; i++) {
9441 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9442 Py_UCS4 mapped[3];
9443 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9444 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009445 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009446 res[k++] = mapped[j];
9447 }
9448 }
9449 return k;
9450}
9451
9452static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009453do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9454{
9455 Py_ssize_t i, k = 0;
9456 int previous_is_cased;
9457
9458 previous_is_cased = 0;
9459 for (i = 0; i < length; i++) {
9460 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9461 Py_UCS4 mapped[3];
9462 int n_res, j;
9463
9464 if (previous_is_cased)
9465 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9466 else
9467 n_res = _PyUnicode_ToTitleFull(c, mapped);
9468
9469 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009470 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009471 res[k++] = mapped[j];
9472 }
9473
9474 previous_is_cased = _PyUnicode_IsCased(c);
9475 }
9476 return k;
9477}
9478
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009479static PyObject *
9480case_operation(PyObject *self,
9481 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9482{
9483 PyObject *res = NULL;
9484 Py_ssize_t length, newlength = 0;
9485 int kind, outkind;
9486 void *data, *outdata;
9487 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9488
Benjamin Petersoneea48462012-01-16 14:28:50 -05009489 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009490
9491 kind = PyUnicode_KIND(self);
9492 data = PyUnicode_DATA(self);
9493 length = PyUnicode_GET_LENGTH(self);
Antoine Pitroub6dc9b72014-10-15 23:14:53 +02009494 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009495 PyErr_SetString(PyExc_OverflowError, "string is too long");
9496 return NULL;
9497 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009498 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009499 if (tmp == NULL)
9500 return PyErr_NoMemory();
9501 newlength = perform(kind, data, length, tmp, &maxchar);
9502 res = PyUnicode_New(newlength, maxchar);
9503 if (res == NULL)
9504 goto leave;
9505 tmpend = tmp + newlength;
9506 outdata = PyUnicode_DATA(res);
9507 outkind = PyUnicode_KIND(res);
9508 switch (outkind) {
9509 case PyUnicode_1BYTE_KIND:
9510 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9511 break;
9512 case PyUnicode_2BYTE_KIND:
9513 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9514 break;
9515 case PyUnicode_4BYTE_KIND:
9516 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9517 break;
9518 default:
9519 assert(0);
9520 break;
9521 }
9522 leave:
9523 PyMem_FREE(tmp);
9524 return res;
9525}
9526
Tim Peters8ce9f162004-08-27 01:49:32 +00009527PyObject *
9528PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009531 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009533 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009534 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9535 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009536 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009538 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009539 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009540 int use_memcpy;
9541 unsigned char *res_data = NULL, *sep_data = NULL;
9542 PyObject *last_obj;
9543 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009545 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009546 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009547 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009548 }
9549
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009550 /* NOTE: the following code can't call back into Python code,
9551 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009552 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009553
Tim Peters05eba1f2004-08-27 21:32:02 +00009554 seqlen = PySequence_Fast_GET_SIZE(fseq);
9555 /* If empty sequence, return u"". */
9556 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009557 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009558 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009559 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009560
Tim Peters05eba1f2004-08-27 21:32:02 +00009561 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009562 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009563 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009564 if (seqlen == 1) {
9565 if (PyUnicode_CheckExact(items[0])) {
9566 res = items[0];
9567 Py_INCREF(res);
9568 Py_DECREF(fseq);
9569 return res;
9570 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009571 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009572 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009573 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009574 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009575 /* Set up sep and seplen */
9576 if (separator == NULL) {
9577 /* fall back to a blank space separator */
9578 sep = PyUnicode_FromOrdinal(' ');
9579 if (!sep)
9580 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009581 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009582 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009583 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009584 else {
9585 if (!PyUnicode_Check(separator)) {
9586 PyErr_Format(PyExc_TypeError,
9587 "separator: expected str instance,"
9588 " %.80s found",
9589 Py_TYPE(separator)->tp_name);
9590 goto onError;
9591 }
9592 if (PyUnicode_READY(separator))
9593 goto onError;
9594 sep = separator;
9595 seplen = PyUnicode_GET_LENGTH(separator);
9596 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9597 /* inc refcount to keep this code path symmetric with the
9598 above case of a blank separator */
9599 Py_INCREF(sep);
9600 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009601 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009602 }
9603
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009604 /* There are at least two things to join, or else we have a subclass
9605 * of str in the sequence.
9606 * Do a pre-pass to figure out the total amount of space we'll
9607 * need (sz), and see whether all argument are strings.
9608 */
9609 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009610#ifdef Py_DEBUG
9611 use_memcpy = 0;
9612#else
9613 use_memcpy = 1;
9614#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009615 for (i = 0; i < seqlen; i++) {
9616 const Py_ssize_t old_sz = sz;
9617 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009618 if (!PyUnicode_Check(item)) {
9619 PyErr_Format(PyExc_TypeError,
9620 "sequence item %zd: expected str instance,"
9621 " %.80s found",
9622 i, Py_TYPE(item)->tp_name);
9623 goto onError;
9624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 if (PyUnicode_READY(item) == -1)
9626 goto onError;
9627 sz += PyUnicode_GET_LENGTH(item);
9628 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009629 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009630 if (i != 0)
9631 sz += seplen;
9632 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9633 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009634 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009635 goto onError;
9636 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009637 if (use_memcpy && last_obj != NULL) {
9638 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9639 use_memcpy = 0;
9640 }
9641 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009642 }
Tim Petersced69f82003-09-16 20:30:58 +00009643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009645 if (res == NULL)
9646 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009647
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009648 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009649#ifdef Py_DEBUG
9650 use_memcpy = 0;
9651#else
9652 if (use_memcpy) {
9653 res_data = PyUnicode_1BYTE_DATA(res);
9654 kind = PyUnicode_KIND(res);
9655 if (seplen != 0)
9656 sep_data = PyUnicode_1BYTE_DATA(sep);
9657 }
9658#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009660 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009661 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009662 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009663 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009664 if (use_memcpy) {
9665 Py_MEMCPY(res_data,
9666 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009667 kind * seplen);
9668 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009669 }
9670 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009671 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009672 res_offset += seplen;
9673 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009674 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009675 itemlen = PyUnicode_GET_LENGTH(item);
9676 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009677 if (use_memcpy) {
9678 Py_MEMCPY(res_data,
9679 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009680 kind * itemlen);
9681 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009682 }
9683 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009684 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009685 res_offset += itemlen;
9686 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009687 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009688 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009689 if (use_memcpy)
9690 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009691 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009692 else
9693 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009694
Tim Peters05eba1f2004-08-27 21:32:02 +00009695 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009697 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699
Benjamin Peterson29060642009-01-31 22:14:21 +00009700 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009701 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009703 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704 return NULL;
9705}
9706
/* FILL(kind, data, value, start, length): store value into length
   consecutive characters of the buffer data, beginning at character index
   start.  kind selects the character width (1-, 2- or 4-byte); any other
   kind trips an assertion.

   The 1-byte case uses memset(), so value is truncated to unsigned char
   there.  The macro arguments may be evaluated more than once, so they
   must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9730
Victor Stinnerd3f08822012-05-29 12:57:52 +02009731void
9732_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9733 Py_UCS4 fill_char)
9734{
9735 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9736 const void *data = PyUnicode_DATA(unicode);
9737 assert(PyUnicode_IS_READY(unicode));
9738 assert(unicode_modifiable(unicode));
9739 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9740 assert(start >= 0);
9741 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9742 FILL(kind, data, fill_char, start, length);
9743}
9744
Victor Stinner3fe55312012-01-04 00:33:50 +01009745Py_ssize_t
9746PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9747 Py_UCS4 fill_char)
9748{
9749 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009750
9751 if (!PyUnicode_Check(unicode)) {
9752 PyErr_BadInternalCall();
9753 return -1;
9754 }
9755 if (PyUnicode_READY(unicode) == -1)
9756 return -1;
9757 if (unicode_check_modifiable(unicode))
9758 return -1;
9759
Victor Stinnerd3f08822012-05-29 12:57:52 +02009760 if (start < 0) {
9761 PyErr_SetString(PyExc_IndexError, "string index out of range");
9762 return -1;
9763 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009764 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9765 PyErr_SetString(PyExc_ValueError,
9766 "fill character is bigger than "
9767 "the string maximum character");
9768 return -1;
9769 }
9770
9771 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9772 length = Py_MIN(maxlen, length);
9773 if (length <= 0)
9774 return 0;
9775
Victor Stinnerd3f08822012-05-29 12:57:52 +02009776 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009777 return length;
9778}
9779
Victor Stinner9310abb2011-10-05 00:59:23 +02009780static PyObject *
9781pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009782 Py_ssize_t left,
9783 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786 PyObject *u;
9787 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009788 int kind;
9789 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790
9791 if (left < 0)
9792 left = 0;
9793 if (right < 0)
9794 right = 0;
9795
Victor Stinnerc4b49542011-12-11 22:44:26 +01009796 if (left == 0 && right == 0)
9797 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9800 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009801 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9802 return NULL;
9803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009805 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009806 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009807 if (!u)
9808 return NULL;
9809
9810 kind = PyUnicode_KIND(u);
9811 data = PyUnicode_DATA(u);
9812 if (left)
9813 FILL(kind, data, fill, 0, left);
9814 if (right)
9815 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009816 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009817 assert(_PyUnicode_CheckConsistency(u, 1));
9818 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819}
9820
Alexander Belopolsky40018472011-02-26 01:02:56 +00009821PyObject *
9822PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009825
9826 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009827 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009828 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009829 if (PyUnicode_READY(string) == -1) {
9830 Py_DECREF(string);
9831 return NULL;
9832 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009833
Benjamin Petersonead6b532011-12-20 17:23:42 -06009834 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009836 if (PyUnicode_IS_ASCII(string))
9837 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009838 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009839 PyUnicode_GET_LENGTH(string), keepends);
9840 else
9841 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009842 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009843 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 break;
9845 case PyUnicode_2BYTE_KIND:
9846 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009847 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009848 PyUnicode_GET_LENGTH(string), keepends);
9849 break;
9850 case PyUnicode_4BYTE_KIND:
9851 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009852 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 PyUnicode_GET_LENGTH(string), keepends);
9854 break;
9855 default:
9856 assert(0);
9857 list = 0;
9858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859 Py_DECREF(string);
9860 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861}
9862
Alexander Belopolsky40018472011-02-26 01:02:56 +00009863static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009864split(PyObject *self,
9865 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009866 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 int kind1, kind2, kind;
9869 void *buf1, *buf2;
9870 Py_ssize_t len1, len2;
9871 PyObject* out;
9872
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009874 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 if (PyUnicode_READY(self) == -1)
9877 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009879 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009880 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009882 if (PyUnicode_IS_ASCII(self))
9883 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009884 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009885 PyUnicode_GET_LENGTH(self), maxcount
9886 );
9887 else
9888 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009889 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009890 PyUnicode_GET_LENGTH(self), maxcount
9891 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 case PyUnicode_2BYTE_KIND:
9893 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009894 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 PyUnicode_GET_LENGTH(self), maxcount
9896 );
9897 case PyUnicode_4BYTE_KIND:
9898 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009899 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 PyUnicode_GET_LENGTH(self), maxcount
9901 );
9902 default:
9903 assert(0);
9904 return NULL;
9905 }
9906
9907 if (PyUnicode_READY(substring) == -1)
9908 return NULL;
9909
9910 kind1 = PyUnicode_KIND(self);
9911 kind2 = PyUnicode_KIND(substring);
9912 kind = kind1 > kind2 ? kind1 : kind2;
9913 buf1 = PyUnicode_DATA(self);
9914 buf2 = PyUnicode_DATA(substring);
9915 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009916 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 if (!buf1)
9918 return NULL;
9919 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009920 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 if (!buf2) {
9922 if (kind1 != kind) PyMem_Free(buf1);
9923 return NULL;
9924 }
9925 len1 = PyUnicode_GET_LENGTH(self);
9926 len2 = PyUnicode_GET_LENGTH(substring);
9927
Benjamin Petersonead6b532011-12-20 17:23:42 -06009928 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009930 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9931 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009932 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009933 else
9934 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009935 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 break;
9937 case PyUnicode_2BYTE_KIND:
9938 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009939 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 break;
9941 case PyUnicode_4BYTE_KIND:
9942 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009943 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 break;
9945 default:
9946 out = NULL;
9947 }
9948 if (kind1 != kind)
9949 PyMem_Free(buf1);
9950 if (kind2 != kind)
9951 PyMem_Free(buf2);
9952 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953}
9954
Alexander Belopolsky40018472011-02-26 01:02:56 +00009955static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009956rsplit(PyObject *self,
9957 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009958 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009959{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009960 int kind1, kind2, kind;
9961 void *buf1, *buf2;
9962 Py_ssize_t len1, len2;
9963 PyObject* out;
9964
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009965 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009966 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 if (PyUnicode_READY(self) == -1)
9969 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009972 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009974 if (PyUnicode_IS_ASCII(self))
9975 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009976 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009977 PyUnicode_GET_LENGTH(self), maxcount
9978 );
9979 else
9980 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009981 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009982 PyUnicode_GET_LENGTH(self), maxcount
9983 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009984 case PyUnicode_2BYTE_KIND:
9985 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009986 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 PyUnicode_GET_LENGTH(self), maxcount
9988 );
9989 case PyUnicode_4BYTE_KIND:
9990 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009991 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 PyUnicode_GET_LENGTH(self), maxcount
9993 );
9994 default:
9995 assert(0);
9996 return NULL;
9997 }
9998
9999 if (PyUnicode_READY(substring) == -1)
10000 return NULL;
10001
10002 kind1 = PyUnicode_KIND(self);
10003 kind2 = PyUnicode_KIND(substring);
10004 kind = kind1 > kind2 ? kind1 : kind2;
10005 buf1 = PyUnicode_DATA(self);
10006 buf2 = PyUnicode_DATA(substring);
10007 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010008 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 if (!buf1)
10010 return NULL;
10011 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010012 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 if (!buf2) {
10014 if (kind1 != kind) PyMem_Free(buf1);
10015 return NULL;
10016 }
10017 len1 = PyUnicode_GET_LENGTH(self);
10018 len2 = PyUnicode_GET_LENGTH(substring);
10019
Benjamin Petersonead6b532011-12-20 17:23:42 -060010020 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010022 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10023 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010024 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010025 else
10026 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010027 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 break;
10029 case PyUnicode_2BYTE_KIND:
10030 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010031 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 break;
10033 case PyUnicode_4BYTE_KIND:
10034 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010035 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 break;
10037 default:
10038 out = NULL;
10039 }
10040 if (kind1 != kind)
10041 PyMem_Free(buf1);
10042 if (kind2 != kind)
10043 PyMem_Free(buf2);
10044 return out;
10045}
10046
10047static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010048anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10049 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010051 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010053 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10054 return asciilib_find(buf1, len1, buf2, len2, offset);
10055 else
10056 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 case PyUnicode_2BYTE_KIND:
10058 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10059 case PyUnicode_4BYTE_KIND:
10060 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10061 }
10062 assert(0);
10063 return -1;
10064}
10065
10066static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010067anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10068 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010070 switch (kind) {
10071 case PyUnicode_1BYTE_KIND:
10072 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10073 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10074 else
10075 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10076 case PyUnicode_2BYTE_KIND:
10077 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10078 case PyUnicode_4BYTE_KIND:
10079 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10080 }
10081 assert(0);
10082 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010083}
10084
Alexander Belopolsky40018472011-02-26 01:02:56 +000010085static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086replace(PyObject *self, PyObject *str1,
10087 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 PyObject *u;
10090 char *sbuf = PyUnicode_DATA(self);
10091 char *buf1 = PyUnicode_DATA(str1);
10092 char *buf2 = PyUnicode_DATA(str2);
10093 int srelease = 0, release1 = 0, release2 = 0;
10094 int skind = PyUnicode_KIND(self);
10095 int kind1 = PyUnicode_KIND(str1);
10096 int kind2 = PyUnicode_KIND(str2);
10097 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10098 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10099 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010100 int mayshrink;
10101 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010102
10103 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010104 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010106 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107
Victor Stinner59de0ee2011-10-07 10:01:28 +020010108 if (str1 == str2)
10109 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 if (skind < kind1)
10111 /* substring too wide to be present */
10112 goto nothing;
10113
Victor Stinner49a0a212011-10-12 23:46:10 +020010114 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10115 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10116 /* Replacing str1 with str2 may cause a maxchar reduction in the
10117 result string. */
10118 mayshrink = (maxchar_str2 < maxchar);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010119 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010122 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010124 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010126 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010127 Py_UCS4 u1, u2;
10128 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010129 Py_ssize_t index, pos;
10130 char *src;
10131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010133 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10134 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010135 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010138 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010140 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010142
10143 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10144 index = 0;
10145 src = sbuf;
10146 while (--maxcount)
10147 {
10148 pos++;
10149 src += pos * PyUnicode_KIND(self);
10150 slen -= pos;
10151 index += pos;
10152 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10153 if (pos < 0)
10154 break;
10155 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10156 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010157 }
10158 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 int rkind = skind;
10160 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010161 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 if (kind1 < rkind) {
10164 /* widen substring */
10165 buf1 = _PyUnicode_AsKind(str1, rkind);
10166 if (!buf1) goto error;
10167 release1 = 1;
10168 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010169 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010170 if (i < 0)
10171 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 if (rkind > kind2) {
10173 /* widen replacement */
10174 buf2 = _PyUnicode_AsKind(str2, rkind);
10175 if (!buf2) goto error;
10176 release2 = 1;
10177 }
10178 else if (rkind < kind2) {
10179 /* widen self and buf1 */
10180 rkind = kind2;
10181 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010182 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 sbuf = _PyUnicode_AsKind(self, rkind);
10184 if (!sbuf) goto error;
10185 srelease = 1;
10186 buf1 = _PyUnicode_AsKind(str1, rkind);
10187 if (!buf1) goto error;
10188 release1 = 1;
10189 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010190 u = PyUnicode_New(slen, maxchar);
10191 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010193 assert(PyUnicode_KIND(u) == rkind);
10194 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010195
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010196 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010197 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010198 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010200 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010202
10203 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010204 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010205 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010206 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010207 if (i == -1)
10208 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010209 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010211 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010215 }
10216 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010218 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 int rkind = skind;
10220 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010223 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 buf1 = _PyUnicode_AsKind(str1, rkind);
10225 if (!buf1) goto error;
10226 release1 = 1;
10227 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010228 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010229 if (n == 0)
10230 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010232 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 buf2 = _PyUnicode_AsKind(str2, rkind);
10234 if (!buf2) goto error;
10235 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010238 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 rkind = kind2;
10240 sbuf = _PyUnicode_AsKind(self, rkind);
10241 if (!sbuf) goto error;
10242 srelease = 1;
10243 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010244 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 buf1 = _PyUnicode_AsKind(str1, rkind);
10246 if (!buf1) goto error;
10247 release1 = 1;
10248 }
10249 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10250 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010251 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 PyErr_SetString(PyExc_OverflowError,
10253 "replace string is too long");
10254 goto error;
10255 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010256 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010257 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010258 _Py_INCREF_UNICODE_EMPTY();
10259 if (!unicode_empty)
10260 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010261 u = unicode_empty;
10262 goto done;
10263 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010264 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 PyErr_SetString(PyExc_OverflowError,
10266 "replace string is too long");
10267 goto error;
10268 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010269 u = PyUnicode_New(new_size, maxchar);
10270 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010272 assert(PyUnicode_KIND(u) == rkind);
10273 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 ires = i = 0;
10275 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010276 while (n-- > 0) {
10277 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010278 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010279 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010280 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010281 if (j == -1)
10282 break;
10283 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010284 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010285 memcpy(res + rkind * ires,
10286 sbuf + rkind * i,
10287 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010289 }
10290 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010292 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010294 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010298 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010300 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010301 memcpy(res + rkind * ires,
10302 sbuf + rkind * i,
10303 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010304 }
10305 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010306 /* interleave */
10307 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010308 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010310 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010312 if (--n <= 0)
10313 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010314 memcpy(res + rkind * ires,
10315 sbuf + rkind * i,
10316 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 ires++;
10318 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010319 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010320 memcpy(res + rkind * ires,
10321 sbuf + rkind * i,
10322 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010323 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010324 }
10325
10326 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010327 unicode_adjust_maxchar(&u);
10328 if (u == NULL)
10329 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010331
10332 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 if (srelease)
10334 PyMem_FREE(sbuf);
10335 if (release1)
10336 PyMem_FREE(buf1);
10337 if (release2)
10338 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010339 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010341
Benjamin Peterson29060642009-01-31 22:14:21 +000010342 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010343 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 if (srelease)
10345 PyMem_FREE(sbuf);
10346 if (release1)
10347 PyMem_FREE(buf1);
10348 if (release2)
10349 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010350 return unicode_result_unchanged(self);
10351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 error:
10353 if (srelease && sbuf)
10354 PyMem_FREE(sbuf);
10355 if (release1 && buf1)
10356 PyMem_FREE(buf1);
10357 if (release2 && buf2)
10358 PyMem_FREE(buf2);
10359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360}
10361
10362/* --- Unicode Object Methods --------------------------------------------- */
10363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010364PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010365 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366\n\
10367Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010368characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369
10370static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010371unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010373 if (PyUnicode_READY(self) == -1)
10374 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010375 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376}
10377
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010378PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010379 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380\n\
10381Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010382have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010383
10384static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010385unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010387 if (PyUnicode_READY(self) == -1)
10388 return NULL;
10389 if (PyUnicode_GET_LENGTH(self) == 0)
10390 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010391 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392}
10393
Benjamin Petersond5890c82012-01-14 13:23:30 -050010394PyDoc_STRVAR(casefold__doc__,
10395 "S.casefold() -> str\n\
10396\n\
10397Return a version of S suitable for caseless comparisons.");
10398
10399static PyObject *
10400unicode_casefold(PyObject *self)
10401{
10402 if (PyUnicode_READY(self) == -1)
10403 return NULL;
10404 if (PyUnicode_IS_ASCII(self))
10405 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010406 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010407}
10408
10409
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010410/* Argument converter. Coerces to a single unicode character */
10411
10412static int
10413convert_uc(PyObject *obj, void *addr)
10414{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010416 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010417
Benjamin Peterson14339b62009-01-31 16:36:08 +000010418 uniobj = PyUnicode_FromObject(obj);
10419 if (uniobj == NULL) {
10420 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010421 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010422 return 0;
10423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010425 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010426 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010427 Py_DECREF(uniobj);
10428 return 0;
10429 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010431 Py_DECREF(uniobj);
10432 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010433}
10434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010435PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010436 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010437\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010438Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010439done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440
10441static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010442unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010444 Py_ssize_t marg, left;
10445 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 Py_UCS4 fillchar = ' ';
10447
Victor Stinnere9a29352011-10-01 02:14:59 +020010448 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010450
Benjamin Petersonbac79492012-01-14 13:34:47 -050010451 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010452 return NULL;
10453
Victor Stinnerc4b49542011-12-11 22:44:26 +010010454 if (PyUnicode_GET_LENGTH(self) >= width)
10455 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456
Victor Stinnerc4b49542011-12-11 22:44:26 +010010457 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010458 left = marg / 2 + (marg & width & 1);
10459
Victor Stinner9310abb2011-10-05 00:59:23 +020010460 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010461}
10462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463/* This function assumes that str1 and str2 are readied by the caller. */
10464
Marc-André Lemburge5034372000-08-08 08:04:29 +000010465static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010466unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010467{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 int kind1, kind2;
10469 void *data1, *data2;
10470 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 kind1 = PyUnicode_KIND(str1);
10473 kind2 = PyUnicode_KIND(str2);
10474 data1 = PyUnicode_DATA(str1);
10475 data2 = PyUnicode_DATA(str2);
10476 len1 = PyUnicode_GET_LENGTH(str1);
10477 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479 for (i = 0; i < len1 && i < len2; ++i) {
10480 Py_UCS4 c1, c2;
10481 c1 = PyUnicode_READ(kind1, data1, i);
10482 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010483
10484 if (c1 != c2)
10485 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010486 }
10487
10488 return (len1 < len2) ? -1 : (len1 != len2);
10489}
10490
Alexander Belopolsky40018472011-02-26 01:02:56 +000010491int
10492PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10495 if (PyUnicode_READY(left) == -1 ||
10496 PyUnicode_READY(right) == -1)
10497 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010498 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010500 PyErr_Format(PyExc_TypeError,
10501 "Can't compare %.100s and %.100s",
10502 left->ob_type->tp_name,
10503 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010504 return -1;
10505}
10506
Martin v. Löwis5b222132007-06-10 09:51:05 +000010507int
10508PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10509{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 Py_ssize_t i;
10511 int kind;
10512 void *data;
10513 Py_UCS4 chr;
10514
Victor Stinner910337b2011-10-03 03:20:16 +020010515 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 if (PyUnicode_READY(uni) == -1)
10517 return -1;
10518 kind = PyUnicode_KIND(uni);
10519 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010520 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10522 if (chr != str[i])
10523 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010524 /* This check keeps Python strings that end in '\0' from comparing equal
10525 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010527 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010528 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010529 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010530 return 0;
10531}
10532
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010533
Benjamin Peterson29060642009-01-31 22:14:21 +000010534#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010535 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010536
Alexander Belopolsky40018472011-02-26 01:02:56 +000010537PyObject *
10538PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010539{
10540 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010541
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010542 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10543 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 if (PyUnicode_READY(left) == -1 ||
10545 PyUnicode_READY(right) == -1)
10546 return NULL;
10547 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10548 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010549 if (op == Py_EQ) {
10550 Py_INCREF(Py_False);
10551 return Py_False;
10552 }
10553 if (op == Py_NE) {
10554 Py_INCREF(Py_True);
10555 return Py_True;
10556 }
10557 }
10558 if (left == right)
10559 result = 0;
10560 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010561 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010562
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010563 /* Convert the return value to a Boolean */
10564 switch (op) {
10565 case Py_EQ:
10566 v = TEST_COND(result == 0);
10567 break;
10568 case Py_NE:
10569 v = TEST_COND(result != 0);
10570 break;
10571 case Py_LE:
10572 v = TEST_COND(result <= 0);
10573 break;
10574 case Py_GE:
10575 v = TEST_COND(result >= 0);
10576 break;
10577 case Py_LT:
10578 v = TEST_COND(result == -1);
10579 break;
10580 case Py_GT:
10581 v = TEST_COND(result == 1);
10582 break;
10583 default:
10584 PyErr_BadArgument();
10585 return NULL;
10586 }
10587 Py_INCREF(v);
10588 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010589 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010590
Brian Curtindfc80e32011-08-10 20:28:54 -050010591 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010592}
10593
Alexander Belopolsky40018472011-02-26 01:02:56 +000010594int
10595PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010596{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010597 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 int kind1, kind2, kind;
10599 void *buf1, *buf2;
10600 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010601 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010602
10603 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010604 sub = PyUnicode_FromObject(element);
10605 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010606 PyErr_Format(PyExc_TypeError,
10607 "'in <string>' requires string as left operand, not %s",
10608 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010609 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010610 }
10611
Thomas Wouters477c8d52006-05-27 19:21:47 +000010612 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010613 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010614 Py_DECREF(sub);
10615 return -1;
10616 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010617 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10618 Py_DECREF(sub);
10619 Py_DECREF(str);
10620 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 kind1 = PyUnicode_KIND(str);
10623 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010624 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 buf1 = PyUnicode_DATA(str);
10626 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010627 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010628 if (kind2 > kind) {
10629 Py_DECREF(sub);
10630 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010631 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010632 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010633 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 if (!buf2) {
10636 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010637 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 return -1;
10639 }
10640 len1 = PyUnicode_GET_LENGTH(str);
10641 len2 = PyUnicode_GET_LENGTH(sub);
10642
Benjamin Petersonead6b532011-12-20 17:23:42 -060010643 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 case PyUnicode_1BYTE_KIND:
10645 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10646 break;
10647 case PyUnicode_2BYTE_KIND:
10648 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10649 break;
10650 case PyUnicode_4BYTE_KIND:
10651 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10652 break;
10653 default:
10654 result = -1;
10655 assert(0);
10656 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010657
10658 Py_DECREF(str);
10659 Py_DECREF(sub);
10660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 if (kind2 != kind)
10662 PyMem_Free(buf2);
10663
Guido van Rossum403d68b2000-03-13 15:55:09 +000010664 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010665}
10666
Guido van Rossumd57fd912000-03-10 22:53:23 +000010667/* Concat to string or Unicode object giving a new Unicode object. */
10668
Alexander Belopolsky40018472011-02-26 01:02:56 +000010669PyObject *
10670PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010673 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010674 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010675
10676 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010679 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010682 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683
10684 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010685 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010686 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010689 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010690 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692 }
10693
Victor Stinner488fa492011-12-12 00:01:39 +010010694 u_len = PyUnicode_GET_LENGTH(u);
10695 v_len = PyUnicode_GET_LENGTH(v);
10696 if (u_len > PY_SSIZE_T_MAX - v_len) {
10697 PyErr_SetString(PyExc_OverflowError,
10698 "strings are too large to concat");
10699 goto onError;
10700 }
10701 new_len = u_len + v_len;
10702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010704 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010705 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010706
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010708 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010710 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010711 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10712 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713 Py_DECREF(u);
10714 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010715 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717
Benjamin Peterson29060642009-01-31 22:14:21 +000010718 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719 Py_XDECREF(u);
10720 Py_XDECREF(v);
10721 return NULL;
10722}
10723
Walter Dörwald1ab83302007-05-18 17:15:44 +000010724void
Victor Stinner23e56682011-10-03 03:54:37 +020010725PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010726{
Victor Stinner23e56682011-10-03 03:54:37 +020010727 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010728 Py_UCS4 maxchar, maxchar2;
10729 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010730
10731 if (p_left == NULL) {
10732 if (!PyErr_Occurred())
10733 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010734 return;
10735 }
Victor Stinner23e56682011-10-03 03:54:37 +020010736 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010737 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010738 if (!PyErr_Occurred())
10739 PyErr_BadInternalCall();
10740 goto error;
10741 }
10742
Benjamin Petersonbac79492012-01-14 13:34:47 -050010743 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010744 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010745 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010746 goto error;
10747
Victor Stinner488fa492011-12-12 00:01:39 +010010748 /* Shortcuts */
10749 if (left == unicode_empty) {
10750 Py_DECREF(left);
10751 Py_INCREF(right);
10752 *p_left = right;
10753 return;
10754 }
10755 if (right == unicode_empty)
10756 return;
10757
10758 left_len = PyUnicode_GET_LENGTH(left);
10759 right_len = PyUnicode_GET_LENGTH(right);
10760 if (left_len > PY_SSIZE_T_MAX - right_len) {
10761 PyErr_SetString(PyExc_OverflowError,
10762 "strings are too large to concat");
10763 goto error;
10764 }
10765 new_len = left_len + right_len;
10766
10767 if (unicode_modifiable(left)
10768 && PyUnicode_CheckExact(right)
10769 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010770 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10771 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010772 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010773 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010774 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10775 {
10776 /* append inplace */
10777 if (unicode_resize(p_left, new_len) != 0) {
10778 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10779 * deallocated so it cannot be put back into
10780 * 'variable'. The MemoryError is raised when there
10781 * is no value in 'variable', which might (very
10782 * remotely) be a cause of incompatibilities.
10783 */
10784 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010785 }
Victor Stinner488fa492011-12-12 00:01:39 +010010786 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010787 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010788 }
Victor Stinner488fa492011-12-12 00:01:39 +010010789 else {
10790 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10791 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010792 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010793
Victor Stinner488fa492011-12-12 00:01:39 +010010794 /* Concat the two Unicode strings */
10795 res = PyUnicode_New(new_len, maxchar);
10796 if (res == NULL)
10797 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010798 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10799 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010800 Py_DECREF(left);
10801 *p_left = res;
10802 }
10803 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010804 return;
10805
10806error:
Victor Stinner488fa492011-12-12 00:01:39 +010010807 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010808}
10809
10810void
10811PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10812{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010813 PyUnicode_Append(pleft, right);
10814 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010815}
10816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010817PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010818 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010820Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010821string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010822interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823
10824static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010825unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010827 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010828 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010829 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 int kind1, kind2, kind;
10832 void *buf1, *buf2;
10833 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834
Jesus Ceaac451502011-04-20 17:09:23 +020010835 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10836 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010837 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 kind1 = PyUnicode_KIND(self);
10840 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010841 if (kind2 > kind1)
10842 return PyLong_FromLong(0);
10843 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 buf1 = PyUnicode_DATA(self);
10845 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010847 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 if (!buf2) {
10849 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 return NULL;
10851 }
10852 len1 = PyUnicode_GET_LENGTH(self);
10853 len2 = PyUnicode_GET_LENGTH(substring);
10854
10855 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010856 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 case PyUnicode_1BYTE_KIND:
10858 iresult = ucs1lib_count(
10859 ((Py_UCS1*)buf1) + start, end - start,
10860 buf2, len2, PY_SSIZE_T_MAX
10861 );
10862 break;
10863 case PyUnicode_2BYTE_KIND:
10864 iresult = ucs2lib_count(
10865 ((Py_UCS2*)buf1) + start, end - start,
10866 buf2, len2, PY_SSIZE_T_MAX
10867 );
10868 break;
10869 case PyUnicode_4BYTE_KIND:
10870 iresult = ucs4lib_count(
10871 ((Py_UCS4*)buf1) + start, end - start,
10872 buf2, len2, PY_SSIZE_T_MAX
10873 );
10874 break;
10875 default:
10876 assert(0); iresult = 0;
10877 }
10878
10879 result = PyLong_FromSsize_t(iresult);
10880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 if (kind2 != kind)
10882 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883
10884 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010885
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886 return result;
10887}
10888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010889PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010890 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010892Encode S using the codec registered for encoding. Default encoding\n\
10893is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010894handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010895a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10896'xmlcharrefreplace' as well as any other name registered with\n\
10897codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898
10899static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010900unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010902 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903 char *encoding = NULL;
10904 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010905
Benjamin Peterson308d6372009-09-18 21:42:35 +000010906 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10907 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010909 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010910}
10911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010912PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010913 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914\n\
10915Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010916If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917
10918static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010919unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010921 Py_ssize_t i, j, line_pos, src_len, incr;
10922 Py_UCS4 ch;
10923 PyObject *u;
10924 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010926 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010927 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928
10929 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931
Antoine Pitrou22425222011-10-04 19:10:51 +020010932 if (PyUnicode_READY(self) == -1)
10933 return NULL;
10934
Thomas Wouters7e474022000-07-16 12:04:32 +000010935 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010936 src_len = PyUnicode_GET_LENGTH(self);
10937 i = j = line_pos = 0;
10938 kind = PyUnicode_KIND(self);
10939 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010940 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010941 for (; i < src_len; i++) {
10942 ch = PyUnicode_READ(kind, src_data, i);
10943 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010944 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010945 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010946 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010947 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010948 goto overflow;
10949 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010950 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010951 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010954 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010955 goto overflow;
10956 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010958 if (ch == '\n' || ch == '\r')
10959 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010961 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010962 if (!found)
10963 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010964
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010966 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967 if (!u)
10968 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010969 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970
Antoine Pitroue71d5742011-10-04 15:55:09 +020010971 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972
Antoine Pitroue71d5742011-10-04 15:55:09 +020010973 for (; i < src_len; i++) {
10974 ch = PyUnicode_READ(kind, src_data, i);
10975 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010977 incr = tabsize - (line_pos % tabsize);
10978 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010979 FILL(kind, dest_data, ' ', j, incr);
10980 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010981 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010982 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010983 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010984 line_pos++;
10985 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010986 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010987 if (ch == '\n' || ch == '\r')
10988 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010990 }
10991 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010992 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010993
Antoine Pitroue71d5742011-10-04 15:55:09 +020010994 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010995 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10996 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997}
10998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010999PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011000 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001\n\
11002Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011003such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004arguments start and end are interpreted as in slice notation.\n\
11005\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011006Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007
11008static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011011 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011012 Py_ssize_t start;
11013 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011014 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015
Jesus Ceaac451502011-04-20 17:09:23 +020011016 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11017 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 if (PyUnicode_READY(self) == -1)
11021 return NULL;
11022 if (PyUnicode_READY(substring) == -1)
11023 return NULL;
11024
Victor Stinner7931d9a2011-11-04 00:22:48 +010011025 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026
11027 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 if (result == -2)
11030 return NULL;
11031
Christian Heimes217cfd12007-12-02 14:31:20 +000011032 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033}
11034
11035static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011036unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011038 void *data;
11039 enum PyUnicode_Kind kind;
11040 Py_UCS4 ch;
11041 PyObject *res;
11042
11043 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11044 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011046 }
11047 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11048 PyErr_SetString(PyExc_IndexError, "string index out of range");
11049 return NULL;
11050 }
11051 kind = PyUnicode_KIND(self);
11052 data = PyUnicode_DATA(self);
11053 ch = PyUnicode_READ(kind, data, index);
11054 if (ch < 256)
11055 return get_latin1_char(ch);
11056
11057 res = PyUnicode_New(1, ch);
11058 if (res == NULL)
11059 return NULL;
11060 kind = PyUnicode_KIND(res);
11061 data = PyUnicode_DATA(res);
11062 PyUnicode_WRITE(kind, data, 0, ch);
11063 assert(_PyUnicode_CheckConsistency(res, 1));
11064 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065}
11066
Guido van Rossumc2504932007-09-18 19:42:40 +000011067/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011068 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011069static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011070unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071{
Guido van Rossumc2504932007-09-18 19:42:40 +000011072 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011073 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011074
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011075#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011076 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011077#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011078 if (_PyUnicode_HASH(self) != -1)
11079 return _PyUnicode_HASH(self);
11080 if (PyUnicode_READY(self) == -1)
11081 return -1;
11082 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011083 /*
11084 We make the hash of the empty string be 0, rather than using
11085 (prefix ^ suffix), since this slightly obfuscates the hash secret
11086 */
11087 if (len == 0) {
11088 _PyUnicode_HASH(self) = 0;
11089 return 0;
11090 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091
11092 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011093#define HASH(P) \
11094 x ^= (Py_uhash_t) *P << 7; \
11095 while (--len >= 0) \
11096 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097
Georg Brandl2fb477c2012-02-21 00:33:36 +010011098 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099 switch (PyUnicode_KIND(self)) {
11100 case PyUnicode_1BYTE_KIND: {
11101 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11102 HASH(c);
11103 break;
11104 }
11105 case PyUnicode_2BYTE_KIND: {
11106 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11107 HASH(s);
11108 break;
11109 }
11110 default: {
11111 Py_UCS4 *l;
11112 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11113 "Impossible switch case in unicode_hash");
11114 l = PyUnicode_4BYTE_DATA(self);
11115 HASH(l);
11116 break;
11117 }
11118 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011119 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11120 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011121
Guido van Rossumc2504932007-09-18 19:42:40 +000011122 if (x == -1)
11123 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011125 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011127#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011129PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011130 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011132Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133
11134static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011135unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011137 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011138 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011139 Py_ssize_t start;
11140 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141
Jesus Ceaac451502011-04-20 17:09:23 +020011142 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11143 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011144 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 if (PyUnicode_READY(self) == -1)
11147 return NULL;
11148 if (PyUnicode_READY(substring) == -1)
11149 return NULL;
11150
Victor Stinner7931d9a2011-11-04 00:22:48 +010011151 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152
11153 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 if (result == -2)
11156 return NULL;
11157
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158 if (result < 0) {
11159 PyErr_SetString(PyExc_ValueError, "substring not found");
11160 return NULL;
11161 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011162
Christian Heimes217cfd12007-12-02 14:31:20 +000011163 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164}
11165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011166PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011167 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011169Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011170at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171
11172static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011173unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 Py_ssize_t i, length;
11176 int kind;
11177 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178 int cased;
11179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 if (PyUnicode_READY(self) == -1)
11181 return NULL;
11182 length = PyUnicode_GET_LENGTH(self);
11183 kind = PyUnicode_KIND(self);
11184 data = PyUnicode_DATA(self);
11185
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 if (length == 1)
11188 return PyBool_FromLong(
11189 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011191 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011193 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011194
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 for (i = 0; i < length; i++) {
11197 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011198
Benjamin Peterson29060642009-01-31 22:14:21 +000011199 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11200 return PyBool_FromLong(0);
11201 else if (!cased && Py_UNICODE_ISLOWER(ch))
11202 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011204 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205}
11206
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011207PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011208 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011210Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011211at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
11213static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011214unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 Py_ssize_t i, length;
11217 int kind;
11218 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219 int cased;
11220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221 if (PyUnicode_READY(self) == -1)
11222 return NULL;
11223 length = PyUnicode_GET_LENGTH(self);
11224 kind = PyUnicode_KIND(self);
11225 data = PyUnicode_DATA(self);
11226
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 if (length == 1)
11229 return PyBool_FromLong(
11230 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011232 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011233 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011234 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011235
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 for (i = 0; i < length; i++) {
11238 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011239
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11241 return PyBool_FromLong(0);
11242 else if (!cased && Py_UNICODE_ISUPPER(ch))
11243 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011245 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246}
11247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011248PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011249 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011251Return True if S is a titlecased string and there is at least one\n\
11252character in S, i.e. upper- and titlecase characters may only\n\
11253follow uncased characters and lowercase characters only cased ones.\n\
11254Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255
11256static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011257unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011259 Py_ssize_t i, length;
11260 int kind;
11261 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262 int cased, previous_is_cased;
11263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011264 if (PyUnicode_READY(self) == -1)
11265 return NULL;
11266 length = PyUnicode_GET_LENGTH(self);
11267 kind = PyUnicode_KIND(self);
11268 data = PyUnicode_DATA(self);
11269
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011271 if (length == 1) {
11272 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11273 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11274 (Py_UNICODE_ISUPPER(ch) != 0));
11275 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011277 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011280
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281 cased = 0;
11282 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 for (i = 0; i < length; i++) {
11284 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011285
Benjamin Peterson29060642009-01-31 22:14:21 +000011286 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11287 if (previous_is_cased)
11288 return PyBool_FromLong(0);
11289 previous_is_cased = 1;
11290 cased = 1;
11291 }
11292 else if (Py_UNICODE_ISLOWER(ch)) {
11293 if (!previous_is_cased)
11294 return PyBool_FromLong(0);
11295 previous_is_cased = 1;
11296 cased = 1;
11297 }
11298 else
11299 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011301 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302}
11303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011304PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011305 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011307Return True if all characters in S are whitespace\n\
11308and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309
11310static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011311unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 Py_ssize_t i, length;
11314 int kind;
11315 void *data;
11316
11317 if (PyUnicode_READY(self) == -1)
11318 return NULL;
11319 length = PyUnicode_GET_LENGTH(self);
11320 kind = PyUnicode_KIND(self);
11321 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 if (length == 1)
11325 return PyBool_FromLong(
11326 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011328 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011330 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 for (i = 0; i < length; i++) {
11333 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011334 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011335 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011337 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338}
11339
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011340PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011341 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011342\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011343Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011344and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011345
11346static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011347unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011348{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 Py_ssize_t i, length;
11350 int kind;
11351 void *data;
11352
11353 if (PyUnicode_READY(self) == -1)
11354 return NULL;
11355 length = PyUnicode_GET_LENGTH(self);
11356 kind = PyUnicode_KIND(self);
11357 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011358
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011359 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 if (length == 1)
11361 return PyBool_FromLong(
11362 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011363
11364 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011368 for (i = 0; i < length; i++) {
11369 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011370 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011371 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011372 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011373}
11374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011375PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011376 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011377\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011378Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011379and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011380
11381static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011382unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011383{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 int kind;
11385 void *data;
11386 Py_ssize_t len, i;
11387
11388 if (PyUnicode_READY(self) == -1)
11389 return NULL;
11390
11391 kind = PyUnicode_KIND(self);
11392 data = PyUnicode_DATA(self);
11393 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011394
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011395 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 if (len == 1) {
11397 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11398 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11399 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011400
11401 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011403 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405 for (i = 0; i < len; i++) {
11406 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011407 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011408 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011409 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011410 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011411}
11412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011413PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011414 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011416Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011417False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
11419static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011420unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 Py_ssize_t i, length;
11423 int kind;
11424 void *data;
11425
11426 if (PyUnicode_READY(self) == -1)
11427 return NULL;
11428 length = PyUnicode_GET_LENGTH(self);
11429 kind = PyUnicode_KIND(self);
11430 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 if (length == 1)
11434 return PyBool_FromLong(
11435 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011437 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011439 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 for (i = 0; i < length; i++) {
11442 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011443 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011445 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446}
11447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011448PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011449 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011451Return True if all characters in S are digits\n\
11452and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453
11454static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011455unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 Py_ssize_t i, length;
11458 int kind;
11459 void *data;
11460
11461 if (PyUnicode_READY(self) == -1)
11462 return NULL;
11463 length = PyUnicode_GET_LENGTH(self);
11464 kind = PyUnicode_KIND(self);
11465 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 if (length == 1) {
11469 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11470 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11471 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011473 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 for (i = 0; i < length; i++) {
11478 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011481 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482}
11483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011484PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011487Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011488False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489
11490static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011491unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 Py_ssize_t i, length;
11494 int kind;
11495 void *data;
11496
11497 if (PyUnicode_READY(self) == -1)
11498 return NULL;
11499 length = PyUnicode_GET_LENGTH(self);
11500 kind = PyUnicode_KIND(self);
11501 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (length == 1)
11505 return PyBool_FromLong(
11506 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011508 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 for (i = 0; i < length; i++) {
11513 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011516 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517}
11518
Martin v. Löwis47383402007-08-15 07:32:56 +000011519int
11520PyUnicode_IsIdentifier(PyObject *self)
11521{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522 int kind;
11523 void *data;
11524 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011525 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 if (PyUnicode_READY(self) == -1) {
11528 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011529 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 }
11531
11532 /* Special case for empty strings */
11533 if (PyUnicode_GET_LENGTH(self) == 0)
11534 return 0;
11535 kind = PyUnicode_KIND(self);
11536 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011537
11538 /* PEP 3131 says that the first character must be in
11539 XID_Start and subsequent characters in XID_Continue,
11540 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011541 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011542 letters, digits, underscore). However, given the current
11543 definition of XID_Start and XID_Continue, it is sufficient
11544 to check just for these, except that _ must be allowed
11545 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011547 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011548 return 0;
11549
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011550 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011553 return 1;
11554}
11555
11556PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011558\n\
11559Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011560to the language definition.\n\
11561\n\
11562Use keyword.iskeyword() to test for reserved identifiers\n\
11563such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011564
11565static PyObject*
11566unicode_isidentifier(PyObject *self)
11567{
11568 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11569}
11570
Georg Brandl559e5d72008-06-11 18:37:52 +000011571PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011572 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011573\n\
11574Return True if all characters in S are considered\n\
11575printable in repr() or S is empty, False otherwise.");
11576
11577static PyObject*
11578unicode_isprintable(PyObject *self)
11579{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 Py_ssize_t i, length;
11581 int kind;
11582 void *data;
11583
11584 if (PyUnicode_READY(self) == -1)
11585 return NULL;
11586 length = PyUnicode_GET_LENGTH(self);
11587 kind = PyUnicode_KIND(self);
11588 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011589
11590 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 if (length == 1)
11592 return PyBool_FromLong(
11593 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 for (i = 0; i < length; i++) {
11596 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011597 Py_RETURN_FALSE;
11598 }
11599 }
11600 Py_RETURN_TRUE;
11601}
11602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011603PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011604 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605\n\
11606Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011607iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608
11609static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011610unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011612 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613}
11614
Martin v. Löwis18e16552006-02-15 17:27:45 +000011615static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011616unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618 if (PyUnicode_READY(self) == -1)
11619 return -1;
11620 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621}
11622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011626Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011627done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628
11629static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011630unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011632 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 Py_UCS4 fillchar = ' ';
11634
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011635 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636 return NULL;
11637
Benjamin Petersonbac79492012-01-14 13:34:47 -050011638 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640
Victor Stinnerc4b49542011-12-11 22:44:26 +010011641 if (PyUnicode_GET_LENGTH(self) >= width)
11642 return unicode_result_unchanged(self);
11643
11644 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645}
11646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011647PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011648 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011650Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651
11652static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011653unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011655 if (PyUnicode_READY(self) == -1)
11656 return NULL;
11657 if (PyUnicode_IS_ASCII(self))
11658 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011659 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660}
11661
/* Selectors for the three strip variants; they double as indexes into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: its format string with the leading
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11670
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011671/* externally visible for str.strip(unicode) */
11672PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011673_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 void *data;
11676 int kind;
11677 Py_ssize_t i, j, len;
11678 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11681 return NULL;
11682
11683 kind = PyUnicode_KIND(self);
11684 data = PyUnicode_DATA(self);
11685 len = PyUnicode_GET_LENGTH(self);
11686 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11687 PyUnicode_DATA(sepobj),
11688 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011689
Benjamin Peterson14339b62009-01-31 16:36:08 +000011690 i = 0;
11691 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 while (i < len &&
11693 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011694 i++;
11695 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011696 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011697
Benjamin Peterson14339b62009-01-31 16:36:08 +000011698 j = len;
11699 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011700 do {
11701 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702 } while (j >= i &&
11703 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011704 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011705 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011706
Victor Stinner7931d9a2011-11-04 00:22:48 +010011707 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708}
11709
11710PyObject*
11711PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11712{
11713 unsigned char *data;
11714 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011715 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716
Victor Stinnerde636f32011-10-01 03:55:54 +020011717 if (PyUnicode_READY(self) == -1)
11718 return NULL;
11719
Victor Stinner684d5fd2012-05-03 02:32:34 +020011720 length = PyUnicode_GET_LENGTH(self);
11721 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011722
Victor Stinner684d5fd2012-05-03 02:32:34 +020011723 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011724 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011725
Victor Stinnerde636f32011-10-01 03:55:54 +020011726 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011727 PyErr_SetString(PyExc_IndexError, "string index out of range");
11728 return NULL;
11729 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011730 if (start >= length || end < start)
11731 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011732
Victor Stinner684d5fd2012-05-03 02:32:34 +020011733 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011734 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011735 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011736 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011737 }
11738 else {
11739 kind = PyUnicode_KIND(self);
11740 data = PyUnicode_1BYTE_DATA(self);
11741 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011742 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011743 length);
11744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746
11747static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011748do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 int kind;
11751 void *data;
11752 Py_ssize_t len, i, j;
11753
11754 if (PyUnicode_READY(self) == -1)
11755 return NULL;
11756
11757 kind = PyUnicode_KIND(self);
11758 data = PyUnicode_DATA(self);
11759 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011760
Benjamin Peterson14339b62009-01-31 16:36:08 +000011761 i = 0;
11762 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011764 i++;
11765 }
11766 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011767
Benjamin Peterson14339b62009-01-31 16:36:08 +000011768 j = len;
11769 if (striptype != LEFTSTRIP) {
11770 do {
11771 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011773 j++;
11774 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011775
Victor Stinner7931d9a2011-11-04 00:22:48 +010011776 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777}
11778
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011779
11780static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011781do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011782{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011783 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011784
Benjamin Peterson14339b62009-01-31 16:36:08 +000011785 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11786 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011787
Benjamin Peterson14339b62009-01-31 16:36:08 +000011788 if (sep != NULL && sep != Py_None) {
11789 if (PyUnicode_Check(sep))
11790 return _PyUnicode_XStrip(self, striptype, sep);
11791 else {
11792 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011793 "%s arg must be None or str",
11794 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011795 return NULL;
11796 }
11797 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011798
Benjamin Peterson14339b62009-01-31 16:36:08 +000011799 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011800}
11801
11802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011803PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011804 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011805\n\
11806Return a copy of the string S with leading and trailing\n\
11807whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011808If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011809
11810static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011811unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011812{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011813 if (PyTuple_GET_SIZE(args) == 0)
11814 return do_strip(self, BOTHSTRIP); /* Common case */
11815 else
11816 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011817}
11818
11819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011820PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011821 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011822\n\
11823Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011824If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011825
11826static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011827unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011828{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011829 if (PyTuple_GET_SIZE(args) == 0)
11830 return do_strip(self, LEFTSTRIP); /* Common case */
11831 else
11832 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011833}
11834
11835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011836PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011837 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011838\n\
11839Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011840If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011841
11842static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011843unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011844{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011845 if (PyTuple_GET_SIZE(args) == 0)
11846 return do_strip(self, RIGHTSTRIP); /* Common case */
11847 else
11848 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011849}
11850
11851
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011853unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011855 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857
Serhiy Storchaka05997252013-01-26 12:14:02 +020011858 if (len < 1)
11859 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860
Victor Stinnerc4b49542011-12-11 22:44:26 +010011861 /* no repeat, return original string */
11862 if (len == 1)
11863 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011864
Benjamin Petersonbac79492012-01-14 13:34:47 -050011865 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 return NULL;
11867
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011868 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011869 PyErr_SetString(PyExc_OverflowError,
11870 "repeated string is too long");
11871 return NULL;
11872 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011874
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011875 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876 if (!u)
11877 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011878 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 if (PyUnicode_GET_LENGTH(str) == 1) {
11881 const int kind = PyUnicode_KIND(str);
11882 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011883 if (kind == PyUnicode_1BYTE_KIND) {
11884 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011885 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011886 }
11887 else if (kind == PyUnicode_2BYTE_KIND) {
11888 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011889 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011890 ucs2[n] = fill_char;
11891 } else {
11892 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11893 assert(kind == PyUnicode_4BYTE_KIND);
11894 for (n = 0; n < len; ++n)
11895 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011896 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 }
11898 else {
11899 /* number of characters copied this far */
11900 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011901 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 char *to = (char *) PyUnicode_DATA(u);
11903 Py_MEMCPY(to, PyUnicode_DATA(str),
11904 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011905 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 n = (done <= nchars-done) ? done : nchars-done;
11907 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011908 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910 }
11911
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011912 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011913 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914}
11915
Alexander Belopolsky40018472011-02-26 01:02:56 +000011916PyObject *
11917PyUnicode_Replace(PyObject *obj,
11918 PyObject *subobj,
11919 PyObject *replobj,
11920 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921{
11922 PyObject *self;
11923 PyObject *str1;
11924 PyObject *str2;
11925 PyObject *result;
11926
11927 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011928 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011931 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011932 Py_DECREF(self);
11933 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 }
11935 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011936 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011937 Py_DECREF(self);
11938 Py_DECREF(str1);
11939 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011941 if (PyUnicode_READY(self) == -1 ||
11942 PyUnicode_READY(str1) == -1 ||
11943 PyUnicode_READY(str2) == -1)
11944 result = NULL;
11945 else
11946 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947 Py_DECREF(self);
11948 Py_DECREF(str1);
11949 Py_DECREF(str2);
11950 return result;
11951}
11952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011953PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011954 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955\n\
11956Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011957old replaced by new. If the optional argument count is\n\
11958given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959
11960static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 PyObject *str1;
11964 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011965 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 PyObject *result;
11967
Martin v. Löwis18e16552006-02-15 17:27:45 +000011968 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011970 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011973 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 return NULL;
11975 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011976 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 Py_DECREF(str1);
11978 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011979 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011980 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11981 result = NULL;
11982 else
11983 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984
11985 Py_DECREF(str1);
11986 Py_DECREF(str2);
11987 return result;
11988}
11989
Alexander Belopolsky40018472011-02-26 01:02:56 +000011990static PyObject *
11991unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011993 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 Py_ssize_t isize;
11995 Py_ssize_t osize, squote, dquote, i, o;
11996 Py_UCS4 max, quote;
11997 int ikind, okind;
11998 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012001 return NULL;
12002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 isize = PyUnicode_GET_LENGTH(unicode);
12004 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 /* Compute length of output, quote characters, and
12007 maximum character */
12008 osize = 2; /* quotes */
12009 max = 127;
12010 squote = dquote = 0;
12011 ikind = PyUnicode_KIND(unicode);
12012 for (i = 0; i < isize; i++) {
12013 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012014 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012016 case '\'': squote++; break;
12017 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012019 incr = 2;
12020 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 default:
12022 /* Fast-path ASCII */
12023 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012024 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012026 ;
12027 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012030 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012032 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012034 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012036 if (osize > PY_SSIZE_T_MAX - incr) {
12037 PyErr_SetString(PyExc_OverflowError,
12038 "string is too long to generate repr");
12039 return NULL;
12040 }
12041 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 }
12043
12044 quote = '\'';
12045 if (squote) {
12046 if (dquote)
12047 /* Both squote and dquote present. Use squote,
12048 and escape them */
12049 osize += squote;
12050 else
12051 quote = '"';
12052 }
12053
12054 repr = PyUnicode_New(osize, max);
12055 if (repr == NULL)
12056 return NULL;
12057 okind = PyUnicode_KIND(repr);
12058 odata = PyUnicode_DATA(repr);
12059
12060 PyUnicode_WRITE(okind, odata, 0, quote);
12061 PyUnicode_WRITE(okind, odata, osize-1, quote);
12062
12063 for (i = 0, o = 1; i < isize; i++) {
12064 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012065
12066 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 if ((ch == quote) || (ch == '\\')) {
12068 PyUnicode_WRITE(okind, odata, o++, '\\');
12069 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012070 continue;
12071 }
12072
Benjamin Peterson29060642009-01-31 22:14:21 +000012073 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012074 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 PyUnicode_WRITE(okind, odata, o++, '\\');
12076 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012077 }
12078 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 PyUnicode_WRITE(okind, odata, o++, '\\');
12080 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012081 }
12082 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 PyUnicode_WRITE(okind, odata, o++, '\\');
12084 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012085 }
12086
12087 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012088 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 PyUnicode_WRITE(okind, odata, o++, '\\');
12090 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012091 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12092 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012093 }
12094
Georg Brandl559e5d72008-06-11 18:37:52 +000012095 /* Copy ASCII characters as-is */
12096 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012098 }
12099
Benjamin Peterson29060642009-01-31 22:14:21 +000012100 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012101 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012102 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012103 (categories Z* and C* except ASCII space)
12104 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012106 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012107 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012110 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12111 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012112 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012113 /* Map 16-bit characters to '\uxxxx' */
12114 else if (ch <= 0xffff) {
12115 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012116 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12117 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12118 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12119 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012120 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012121 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012122 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012123 PyUnicode_WRITE(okind, odata, o++, 'U');
12124 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12125 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12126 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12127 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012128 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12129 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12130 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12131 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012132 }
12133 }
12134 /* Copy characters as-is */
12135 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012137 }
12138 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012141 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012142 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143}
12144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012145PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012146 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147\n\
12148Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012149such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150arguments start and end are interpreted as in slice notation.\n\
12151\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012152Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153
12154static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012157 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012158 Py_ssize_t start;
12159 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012160 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161
Jesus Ceaac451502011-04-20 17:09:23 +020012162 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12163 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012164 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 if (PyUnicode_READY(self) == -1)
12167 return NULL;
12168 if (PyUnicode_READY(substring) == -1)
12169 return NULL;
12170
Victor Stinner7931d9a2011-11-04 00:22:48 +010012171 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172
12173 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 if (result == -2)
12176 return NULL;
12177
Christian Heimes217cfd12007-12-02 14:31:20 +000012178 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179}
12180
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012181PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012182 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012184Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185
12186static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012187unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012189 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012190 Py_ssize_t start;
12191 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012192 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193
Jesus Ceaac451502011-04-20 17:09:23 +020012194 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12195 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012196 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 if (PyUnicode_READY(self) == -1)
12199 return NULL;
12200 if (PyUnicode_READY(substring) == -1)
12201 return NULL;
12202
Victor Stinner7931d9a2011-11-04 00:22:48 +010012203 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204
12205 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 if (result == -2)
12208 return NULL;
12209
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210 if (result < 0) {
12211 PyErr_SetString(PyExc_ValueError, "substring not found");
12212 return NULL;
12213 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214
Christian Heimes217cfd12007-12-02 14:31:20 +000012215 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216}
12217
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012218PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012219 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012221Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012222done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223
12224static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012225unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012227 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 Py_UCS4 fillchar = ' ';
12229
Victor Stinnere9a29352011-10-01 02:14:59 +020012230 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012232
Benjamin Petersonbac79492012-01-14 13:34:47 -050012233 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234 return NULL;
12235
Victor Stinnerc4b49542011-12-11 22:44:26 +010012236 if (PyUnicode_GET_LENGTH(self) >= width)
12237 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238
Victor Stinnerc4b49542011-12-11 22:44:26 +010012239 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240}
12241
Alexander Belopolsky40018472011-02-26 01:02:56 +000012242PyObject *
12243PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244{
12245 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012246
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247 s = PyUnicode_FromObject(s);
12248 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012249 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 if (sep != NULL) {
12251 sep = PyUnicode_FromObject(sep);
12252 if (sep == NULL) {
12253 Py_DECREF(s);
12254 return NULL;
12255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256 }
12257
Victor Stinner9310abb2011-10-05 00:59:23 +020012258 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259
12260 Py_DECREF(s);
12261 Py_XDECREF(sep);
12262 return result;
12263}
12264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012265PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012266 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267\n\
12268Return a list of the words in S, using sep as the\n\
12269delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012270splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012271whitespace string is a separator and empty strings are\n\
12272removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273
12274static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012275unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012277 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012279 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012281 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12282 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283 return NULL;
12284
12285 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012288 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012290 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291}
12292
Thomas Wouters477c8d52006-05-27 19:21:47 +000012293PyObject *
12294PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12295{
12296 PyObject* str_obj;
12297 PyObject* sep_obj;
12298 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 int kind1, kind2, kind;
12300 void *buf1 = NULL, *buf2 = NULL;
12301 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012302
12303 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012304 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012306 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012307 if (!sep_obj) {
12308 Py_DECREF(str_obj);
12309 return NULL;
12310 }
12311 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12312 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012313 Py_DECREF(str_obj);
12314 return NULL;
12315 }
12316
Victor Stinner14f8f022011-10-05 20:58:25 +020012317 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012319 kind = Py_MAX(kind1, kind2);
12320 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012322 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 if (!buf1)
12324 goto onError;
12325 buf2 = PyUnicode_DATA(sep_obj);
12326 if (kind2 != kind)
12327 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12328 if (!buf2)
12329 goto onError;
12330 len1 = PyUnicode_GET_LENGTH(str_obj);
12331 len2 = PyUnicode_GET_LENGTH(sep_obj);
12332
Benjamin Petersonead6b532011-12-20 17:23:42 -060012333 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012335 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12336 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12337 else
12338 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 break;
12340 case PyUnicode_2BYTE_KIND:
12341 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12342 break;
12343 case PyUnicode_4BYTE_KIND:
12344 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12345 break;
12346 default:
12347 assert(0);
12348 out = 0;
12349 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012350
12351 Py_DECREF(sep_obj);
12352 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353 if (kind1 != kind)
12354 PyMem_Free(buf1);
12355 if (kind2 != kind)
12356 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012357
12358 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 onError:
12360 Py_DECREF(sep_obj);
12361 Py_DECREF(str_obj);
12362 if (kind1 != kind && buf1)
12363 PyMem_Free(buf1);
12364 if (kind2 != kind && buf2)
12365 PyMem_Free(buf2);
12366 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012367}
12368
12369
12370PyObject *
12371PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12372{
12373 PyObject* str_obj;
12374 PyObject* sep_obj;
12375 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 int kind1, kind2, kind;
12377 void *buf1 = NULL, *buf2 = NULL;
12378 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012379
12380 str_obj = PyUnicode_FromObject(str_in);
12381 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012382 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012383 sep_obj = PyUnicode_FromObject(sep_in);
12384 if (!sep_obj) {
12385 Py_DECREF(str_obj);
12386 return NULL;
12387 }
12388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 kind1 = PyUnicode_KIND(str_in);
12390 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012391 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 buf1 = PyUnicode_DATA(str_in);
12393 if (kind1 != kind)
12394 buf1 = _PyUnicode_AsKind(str_in, kind);
12395 if (!buf1)
12396 goto onError;
12397 buf2 = PyUnicode_DATA(sep_obj);
12398 if (kind2 != kind)
12399 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12400 if (!buf2)
12401 goto onError;
12402 len1 = PyUnicode_GET_LENGTH(str_obj);
12403 len2 = PyUnicode_GET_LENGTH(sep_obj);
12404
Benjamin Petersonead6b532011-12-20 17:23:42 -060012405 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012407 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12408 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12409 else
12410 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 break;
12412 case PyUnicode_2BYTE_KIND:
12413 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12414 break;
12415 case PyUnicode_4BYTE_KIND:
12416 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12417 break;
12418 default:
12419 assert(0);
12420 out = 0;
12421 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012422
12423 Py_DECREF(sep_obj);
12424 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 if (kind1 != kind)
12426 PyMem_Free(buf1);
12427 if (kind2 != kind)
12428 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012429
12430 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 onError:
12432 Py_DECREF(sep_obj);
12433 Py_DECREF(str_obj);
12434 if (kind1 != kind && buf1)
12435 PyMem_Free(buf1);
12436 if (kind2 != kind && buf2)
12437 PyMem_Free(buf2);
12438 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012439}
12440
12441PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012442 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012443\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012444Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012445the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012446found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012447
12448static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012449unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012450{
Victor Stinner9310abb2011-10-05 00:59:23 +020012451 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012452}
12453
12454PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012455 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012456\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012457Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012458the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012459separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012460
12461static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012462unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012463{
Victor Stinner9310abb2011-10-05 00:59:23 +020012464 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012465}
12466
Alexander Belopolsky40018472011-02-26 01:02:56 +000012467PyObject *
12468PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012469{
12470 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012471
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012472 s = PyUnicode_FromObject(s);
12473 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012474 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012475 if (sep != NULL) {
12476 sep = PyUnicode_FromObject(sep);
12477 if (sep == NULL) {
12478 Py_DECREF(s);
12479 return NULL;
12480 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012481 }
12482
Victor Stinner9310abb2011-10-05 00:59:23 +020012483 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012484
12485 Py_DECREF(s);
12486 Py_XDECREF(sep);
12487 return result;
12488}
12489
12490PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012491 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012492\n\
12493Return a list of the words in S, using sep as the\n\
12494delimiter string, starting at the end of the string and\n\
12495working to the front. If maxsplit is given, at most maxsplit\n\
12496splits are done. If sep is not specified, any whitespace string\n\
12497is a separator.");
12498
12499static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012500unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012501{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012502 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012503 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012504 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012505
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012506 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12507 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012508 return NULL;
12509
12510 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012511 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012512 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012513 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012514 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012515 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012516}
12517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012518PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012519 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520\n\
12521Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012522Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012523is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524
12525static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012526unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012528 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012529 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012531 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12532 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533 return NULL;
12534
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012535 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536}
12537
12538static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012539PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012541 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542}
12543
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012544PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012545 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546\n\
12547Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012548and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549
12550static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012551unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012553 if (PyUnicode_READY(self) == -1)
12554 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012555 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556}
12557
Georg Brandlceee0772007-11-27 23:48:05 +000012558PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012559 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012560\n\
12561Return a translation table usable for str.translate().\n\
12562If there is only one argument, it must be a dictionary mapping Unicode\n\
12563ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012564Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012565If there are two arguments, they must be strings of equal length, and\n\
12566in the resulting dictionary, each character in x will be mapped to the\n\
12567character at the same position in y. If there is a third argument, it\n\
12568must be a string, whose characters will be mapped to None in the result.");
12569
12570static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012571unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012572{
12573 PyObject *x, *y = NULL, *z = NULL;
12574 PyObject *new = NULL, *key, *value;
12575 Py_ssize_t i = 0;
12576 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012577
Georg Brandlceee0772007-11-27 23:48:05 +000012578 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12579 return NULL;
12580 new = PyDict_New();
12581 if (!new)
12582 return NULL;
12583 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 int x_kind, y_kind, z_kind;
12585 void *x_data, *y_data, *z_data;
12586
Georg Brandlceee0772007-11-27 23:48:05 +000012587 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012588 if (!PyUnicode_Check(x)) {
12589 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12590 "be a string if there is a second argument");
12591 goto err;
12592 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012594 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12595 "arguments must have equal length");
12596 goto err;
12597 }
12598 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599 x_kind = PyUnicode_KIND(x);
12600 y_kind = PyUnicode_KIND(y);
12601 x_data = PyUnicode_DATA(x);
12602 y_data = PyUnicode_DATA(y);
12603 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12604 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012605 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012606 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012607 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012608 if (!value) {
12609 Py_DECREF(key);
12610 goto err;
12611 }
Georg Brandlceee0772007-11-27 23:48:05 +000012612 res = PyDict_SetItem(new, key, value);
12613 Py_DECREF(key);
12614 Py_DECREF(value);
12615 if (res < 0)
12616 goto err;
12617 }
12618 /* create entries for deleting chars in z */
12619 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 z_kind = PyUnicode_KIND(z);
12621 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012622 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012624 if (!key)
12625 goto err;
12626 res = PyDict_SetItem(new, key, Py_None);
12627 Py_DECREF(key);
12628 if (res < 0)
12629 goto err;
12630 }
12631 }
12632 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 int kind;
12634 void *data;
12635
Georg Brandlceee0772007-11-27 23:48:05 +000012636 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012637 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012638 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12639 "to maketrans it must be a dict");
12640 goto err;
12641 }
12642 /* copy entries into the new dict, converting string keys to int keys */
12643 while (PyDict_Next(x, &i, &key, &value)) {
12644 if (PyUnicode_Check(key)) {
12645 /* convert string keys to integer keys */
12646 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012647 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012648 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12649 "table must be of length 1");
12650 goto err;
12651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 kind = PyUnicode_KIND(key);
12653 data = PyUnicode_DATA(key);
12654 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012655 if (!newkey)
12656 goto err;
12657 res = PyDict_SetItem(new, newkey, value);
12658 Py_DECREF(newkey);
12659 if (res < 0)
12660 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012661 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012662 /* just keep integer keys */
12663 if (PyDict_SetItem(new, key, value) < 0)
12664 goto err;
12665 } else {
12666 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12667 "be strings or integers");
12668 goto err;
12669 }
12670 }
12671 }
12672 return new;
12673 err:
12674 Py_DECREF(new);
12675 return NULL;
12676}
12677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012678PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012679 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680\n\
12681Return a copy of the string S, where all characters have been mapped\n\
12682through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012683Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012684Unmapped characters are left untouched. Characters mapped to None\n\
12685are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686
12687static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691}
12692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012693PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012694 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012696Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697
12698static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012699unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012701 if (PyUnicode_READY(self) == -1)
12702 return NULL;
12703 if (PyUnicode_IS_ASCII(self))
12704 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012705 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706}
12707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012708PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012709 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012710\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012711Pad a numeric string S with zeros on the left, to fill a field\n\
12712of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713
12714static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012715unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012717 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012718 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012719 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 int kind;
12721 void *data;
12722 Py_UCS4 chr;
12723
Martin v. Löwis18e16552006-02-15 17:27:45 +000012724 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725 return NULL;
12726
Benjamin Petersonbac79492012-01-14 13:34:47 -050012727 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012728 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729
Victor Stinnerc4b49542011-12-11 22:44:26 +010012730 if (PyUnicode_GET_LENGTH(self) >= width)
12731 return unicode_result_unchanged(self);
12732
12733 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734
12735 u = pad(self, fill, 0, '0');
12736
Walter Dörwald068325e2002-04-15 13:36:47 +000012737 if (u == NULL)
12738 return NULL;
12739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740 kind = PyUnicode_KIND(u);
12741 data = PyUnicode_DATA(u);
12742 chr = PyUnicode_READ(kind, data, fill);
12743
12744 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012746 PyUnicode_WRITE(kind, data, 0, chr);
12747 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012748 }
12749
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012750 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012751 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012752}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012753
#if 0
/* Compiled out.  Forwards to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012762PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012763 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012765Return True if S starts with the specified prefix, False otherwise.\n\
12766With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012767With optional end, stop comparing S at that position.\n\
12768prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769
12770static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012771unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012773{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012774 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012775 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012776 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012777 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012778 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779
Jesus Ceaac451502011-04-20 17:09:23 +020012780 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012781 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012782 if (PyTuple_Check(subobj)) {
12783 Py_ssize_t i;
12784 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012785 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012786 if (substring == NULL)
12787 return NULL;
12788 result = tailmatch(self, substring, start, end, -1);
12789 Py_DECREF(substring);
12790 if (result) {
12791 Py_RETURN_TRUE;
12792 }
12793 }
12794 /* nothing matched */
12795 Py_RETURN_FALSE;
12796 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012797 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012798 if (substring == NULL) {
12799 if (PyErr_ExceptionMatches(PyExc_TypeError))
12800 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12801 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012803 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012804 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012806 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807}
12808
12809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012810PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012811 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012813Return True if S ends with the specified suffix, False otherwise.\n\
12814With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012815With optional end, stop comparing S at that position.\n\
12816suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817
12818static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012819unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012822 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012823 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012824 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012825 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012826 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012827
Jesus Ceaac451502011-04-20 17:09:23 +020012828 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012830 if (PyTuple_Check(subobj)) {
12831 Py_ssize_t i;
12832 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012833 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012834 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012835 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012836 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012837 result = tailmatch(self, substring, start, end, +1);
12838 Py_DECREF(substring);
12839 if (result) {
12840 Py_RETURN_TRUE;
12841 }
12842 }
12843 Py_RETURN_FALSE;
12844 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012845 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012846 if (substring == NULL) {
12847 if (PyErr_ExceptionMatches(PyExc_TypeError))
12848 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12849 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012850 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012851 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012852 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012854 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855}
12856
Victor Stinner202fdca2012-05-07 12:47:02 +020012857Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012858_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012859{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012860 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012861 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12862 writer->data = PyUnicode_DATA(writer->buffer);
12863 writer->kind = PyUnicode_KIND(writer->buffer);
12864}
12865
Victor Stinnerd3f08822012-05-29 12:57:52 +020012866void
12867_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012868{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012869 memset(writer, 0, sizeof(*writer));
12870#ifdef Py_DEBUG
12871 writer->kind = 5; /* invalid kind */
12872#endif
12873 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012874 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012875}
12876
Victor Stinnerd3f08822012-05-29 12:57:52 +020012877int
12878_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12879 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012880{
12881 Py_ssize_t newlen;
12882 PyObject *newbuffer;
12883
Victor Stinnerd3f08822012-05-29 12:57:52 +020012884 assert(length > 0);
12885
Victor Stinner202fdca2012-05-07 12:47:02 +020012886 if (length > PY_SSIZE_T_MAX - writer->pos) {
12887 PyErr_NoMemory();
12888 return -1;
12889 }
12890 newlen = writer->pos + length;
12891
Victor Stinnerd3f08822012-05-29 12:57:52 +020012892 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012893 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012894 /* overallocate 25% to limit the number of resize */
12895 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12896 newlen += newlen / 4;
12897 if (newlen < writer->min_length)
12898 newlen = writer->min_length;
12899 }
12900 writer->buffer = PyUnicode_New(newlen, maxchar);
12901 if (writer->buffer == NULL)
12902 return -1;
12903 _PyUnicodeWriter_Update(writer);
12904 return 0;
12905 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012906
Victor Stinnerd3f08822012-05-29 12:57:52 +020012907 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012908 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012909 /* overallocate 25% to limit the number of resize */
12910 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12911 newlen += newlen / 4;
12912 if (newlen < writer->min_length)
12913 newlen = writer->min_length;
12914 }
12915
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012916 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012917 /* resize + widen */
12918 newbuffer = PyUnicode_New(newlen, maxchar);
12919 if (newbuffer == NULL)
12920 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012921 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12922 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012923 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012924 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012925 }
12926 else {
12927 newbuffer = resize_compact(writer->buffer, newlen);
12928 if (newbuffer == NULL)
12929 return -1;
12930 }
12931 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012932 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012933 }
12934 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012935 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012936 newbuffer = PyUnicode_New(writer->size, maxchar);
12937 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012938 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012939 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12940 writer->buffer, 0, writer->pos);
12941 Py_DECREF(writer->buffer);
12942 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012943 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012944 }
12945 return 0;
12946}
12947
Victor Stinnerd3f08822012-05-29 12:57:52 +020012948int
12949_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12950{
12951 Py_UCS4 maxchar;
12952 Py_ssize_t len;
12953
12954 if (PyUnicode_READY(str) == -1)
12955 return -1;
12956 len = PyUnicode_GET_LENGTH(str);
12957 if (len == 0)
12958 return 0;
12959 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12960 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012961 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012962 Py_INCREF(str);
12963 writer->buffer = str;
12964 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012965 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012966 writer->size = 0;
12967 writer->pos += len;
12968 return 0;
12969 }
12970 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12971 return -1;
12972 }
12973 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12974 str, 0, len);
12975 writer->pos += len;
12976 return 0;
12977}
12978
12979PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012980_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012981{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012982 if (writer->pos == 0) {
12983 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020012984 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020012985 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012986 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012987 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12988 return writer->buffer;
12989 }
12990 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12991 PyObject *newbuffer;
12992 newbuffer = resize_compact(writer->buffer, writer->pos);
12993 if (newbuffer == NULL) {
12994 Py_DECREF(writer->buffer);
12995 return NULL;
12996 }
12997 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012998 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012999 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner2cb16aa2013-03-06 19:28:37 +010013000 return unicode_result_ready(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013001}
13002
Victor Stinnerd3f08822012-05-29 12:57:52 +020013003void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013004_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013005{
13006 Py_CLEAR(writer->buffer);
13007}
13008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013010
13011PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013012 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013013\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013014Return a formatted version of S, using substitutions from args and kwargs.\n\
13015The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013016
Eric Smith27bbca62010-11-04 17:06:58 +000013017PyDoc_STRVAR(format_map__doc__,
13018 "S.format_map(mapping) -> str\n\
13019\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013020Return a formatted version of S, using substitutions from mapping.\n\
13021The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013022
Eric Smith4a7d76d2008-05-30 18:10:19 +000013023static PyObject *
13024unicode__format__(PyObject* self, PyObject* args)
13025{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013026 PyObject *format_spec;
13027 _PyUnicodeWriter writer;
13028 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013029
13030 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13031 return NULL;
13032
Victor Stinnerd3f08822012-05-29 12:57:52 +020013033 if (PyUnicode_READY(self) == -1)
13034 return NULL;
13035 _PyUnicodeWriter_Init(&writer, 0);
13036 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13037 self, format_spec, 0,
13038 PyUnicode_GET_LENGTH(format_spec));
13039 if (ret == -1) {
13040 _PyUnicodeWriter_Dealloc(&writer);
13041 return NULL;
13042 }
13043 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013044}
13045
Eric Smith8c663262007-08-25 02:26:07 +000013046PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013047 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013048\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013049Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013050
13051static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013052unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013053{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 Py_ssize_t size;
13055
13056 /* If it's a compact object, account for base structure +
13057 character data. */
13058 if (PyUnicode_IS_COMPACT_ASCII(v))
13059 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13060 else if (PyUnicode_IS_COMPACT(v))
13061 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013062 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 else {
13064 /* If it is a two-block object, account for base object, and
13065 for character block if present. */
13066 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013067 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013068 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013069 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013070 }
13071 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013072 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013073 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013075 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013076 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077
13078 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013079}
13080
13081PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013082 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013083
13084static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013085unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013086{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013087 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013088 if (!copy)
13089 return NULL;
13090 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013091}
13092
Guido van Rossumd57fd912000-03-10 22:53:23 +000013093static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013094 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013095 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013096 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13097 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013098 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13099 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013100 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013101 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13102 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13103 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13104 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13105 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013106 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013107 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13108 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13109 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013110 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013111 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13112 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13113 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013114 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013115 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013116 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013117 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013118 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13119 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13120 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13121 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13122 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13123 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13124 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13125 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13126 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13127 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13128 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13129 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13130 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13131 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013132 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013133 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013134 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013135 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013136 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013137 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013138 {"maketrans", (PyCFunction) unicode_maketrans,
13139 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013140 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013141#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013142 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013143 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144#endif
13145
Benjamin Peterson14339b62009-01-31 16:36:08 +000013146 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147 {NULL, NULL}
13148};
13149
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013150static PyObject *
13151unicode_mod(PyObject *v, PyObject *w)
13152{
Brian Curtindfc80e32011-08-10 20:28:54 -050013153 if (!PyUnicode_Check(v))
13154 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013155 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013156}
13157
13158static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013159 0, /*nb_add*/
13160 0, /*nb_subtract*/
13161 0, /*nb_multiply*/
13162 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013163};
13164
Guido van Rossumd57fd912000-03-10 22:53:23 +000013165static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013166 (lenfunc) unicode_length, /* sq_length */
13167 PyUnicode_Concat, /* sq_concat */
13168 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13169 (ssizeargfunc) unicode_getitem, /* sq_item */
13170 0, /* sq_slice */
13171 0, /* sq_ass_item */
13172 0, /* sq_ass_slice */
13173 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174};
13175
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013176static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013177unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013178{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013179 if (PyUnicode_READY(self) == -1)
13180 return NULL;
13181
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013182 if (PyIndex_Check(item)) {
13183 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013184 if (i == -1 && PyErr_Occurred())
13185 return NULL;
13186 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013187 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013188 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013189 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013190 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013191 PyObject *result;
13192 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013193 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013194 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013197 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013198 return NULL;
13199 }
13200
13201 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013202 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013204 slicelength == PyUnicode_GET_LENGTH(self)) {
13205 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013206 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013207 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013208 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013209 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013210 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013211 src_kind = PyUnicode_KIND(self);
13212 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013213 if (!PyUnicode_IS_ASCII(self)) {
13214 kind_limit = kind_maxchar_limit(src_kind);
13215 max_char = 0;
13216 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13217 ch = PyUnicode_READ(src_kind, src_data, cur);
13218 if (ch > max_char) {
13219 max_char = ch;
13220 if (max_char >= kind_limit)
13221 break;
13222 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013223 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013224 }
Victor Stinner55c99112011-10-13 01:17:06 +020013225 else
13226 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013227 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013228 if (result == NULL)
13229 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013230 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013231 dest_data = PyUnicode_DATA(result);
13232
13233 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013234 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13235 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013236 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013237 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013238 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013239 } else {
13240 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13241 return NULL;
13242 }
13243}
13244
13245static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013246 (lenfunc)unicode_length, /* mp_length */
13247 (binaryfunc)unicode_subscript, /* mp_subscript */
13248 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013249};
13250
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252/* Helpers for PyUnicode_Format() */
13253
13254static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013255getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013257 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013258 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013259 (*p_argidx)++;
13260 if (arglen < 0)
13261 return args;
13262 else
13263 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013264 }
13265 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013266 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267 return NULL;
13268}
13269
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013270/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271
Victor Stinnerd3f08822012-05-29 12:57:52 +020013272static int
13273formatfloat(PyObject *v, int flags, int prec, int type,
13274 PyObject **p_output, _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013275{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013276 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013278 Py_ssize_t len;
Tim Petersced69f82003-09-16 20:30:58 +000013279
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280 x = PyFloat_AsDouble(v);
13281 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013282 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013283
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013285 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013286
Eric Smith0923d1d2009-04-16 20:16:10 +000013287 p = PyOS_double_to_string(x, type, prec,
13288 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013289 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013290 return -1;
13291 len = strlen(p);
13292 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013293 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13294 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013295 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013296 }
Victor Stinner184252a2012-06-16 02:57:41 +020013297 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013298 writer->pos += len;
13299 }
13300 else
13301 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013302 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013303 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304}
13305
Victor Stinnerd0880d52012-04-27 23:40:13 +020013306/* formatlong() emulates the format codes d, u, o, x and X, and
13307 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13308 * Python's regular ints.
13309 * Return value: a new PyUnicodeObject*, or NULL if error.
13310 * The output string is of the form
13311 * "-"? ("0x" | "0X")? digit+
13312 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13313 * set in flags. The case of hex digits will be correct,
13314 * There will be at least prec digits, zero-filled on the left if
13315 * necessary to get that many.
13316 * val object to be converted
13317 * flags bitmask of format flags; only F_ALT is looked at
13318 * prec minimum number of digits; 0-fill on left if needed
13319 * type a character in [duoxX]; u acts the same as d
13320 *
13321 * CAUTION: o, x and X conversions on regular ints can never
13322 * produce a '-' sign, but can for Python's unbounded ints.
13323 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013324static PyObject*
13325formatlong(PyObject *val, int flags, int prec, int type)
13326{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013327 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013328 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013329 Py_ssize_t i;
13330 int sign; /* 1 if '-', else 0 */
13331 int len; /* number of characters */
13332 Py_ssize_t llen;
13333 int numdigits; /* len == numnondigits + numdigits */
13334 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013335
Victor Stinnerd0880d52012-04-27 23:40:13 +020013336 /* Avoid exceeding SSIZE_T_MAX */
13337 if (prec > INT_MAX-3) {
13338 PyErr_SetString(PyExc_OverflowError,
13339 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013340 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013341 }
13342
13343 assert(PyLong_Check(val));
13344
13345 switch (type) {
13346 case 'd':
13347 case 'u':
13348 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013349 if (PyBool_Check(val))
13350 result = PyNumber_ToBase(val, 10);
13351 else
13352 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013353 break;
13354 case 'o':
13355 numnondigits = 2;
13356 result = PyNumber_ToBase(val, 8);
13357 break;
13358 case 'x':
13359 case 'X':
13360 numnondigits = 2;
13361 result = PyNumber_ToBase(val, 16);
13362 break;
13363 default:
13364 assert(!"'type' not in [duoxX]");
13365 }
13366 if (!result)
13367 return NULL;
13368
13369 assert(unicode_modifiable(result));
13370 assert(PyUnicode_IS_READY(result));
13371 assert(PyUnicode_IS_ASCII(result));
13372
13373 /* To modify the string in-place, there can only be one reference. */
13374 if (Py_REFCNT(result) != 1) {
13375 PyErr_BadInternalCall();
13376 return NULL;
13377 }
13378 buf = PyUnicode_DATA(result);
13379 llen = PyUnicode_GET_LENGTH(result);
13380 if (llen > INT_MAX) {
13381 PyErr_SetString(PyExc_ValueError,
13382 "string too large in _PyBytes_FormatLong");
13383 return NULL;
13384 }
13385 len = (int)llen;
13386 sign = buf[0] == '-';
13387 numnondigits += sign;
13388 numdigits = len - numnondigits;
13389 assert(numdigits > 0);
13390
13391 /* Get rid of base marker unless F_ALT */
13392 if (((flags & F_ALT) == 0 &&
13393 (type == 'o' || type == 'x' || type == 'X'))) {
13394 assert(buf[sign] == '0');
13395 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13396 buf[sign+1] == 'o');
13397 numnondigits -= 2;
13398 buf += 2;
13399 len -= 2;
13400 if (sign)
13401 buf[0] = '-';
13402 assert(len == numnondigits + numdigits);
13403 assert(numdigits > 0);
13404 }
13405
13406 /* Fill with leading zeroes to meet minimum width. */
13407 if (prec > numdigits) {
13408 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13409 numnondigits + prec);
13410 char *b1;
13411 if (!r1) {
13412 Py_DECREF(result);
13413 return NULL;
13414 }
13415 b1 = PyBytes_AS_STRING(r1);
13416 for (i = 0; i < numnondigits; ++i)
13417 *b1++ = *buf++;
13418 for (i = 0; i < prec - numdigits; i++)
13419 *b1++ = '0';
13420 for (i = 0; i < numdigits; i++)
13421 *b1++ = *buf++;
13422 *b1 = '\0';
13423 Py_DECREF(result);
13424 result = r1;
13425 buf = PyBytes_AS_STRING(result);
13426 len = numnondigits + prec;
13427 }
13428
13429 /* Fix up case for hex conversions. */
13430 if (type == 'X') {
13431 /* Need to convert all lower case letters to upper case.
13432 and need to convert 0x to 0X (and -0x to -0X). */
13433 for (i = 0; i < len; i++)
13434 if (buf[i] >= 'a' && buf[i] <= 'x')
13435 buf[i] -= 'a'-'A';
13436 }
13437 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13438 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013439 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013440 Py_DECREF(result);
13441 result = unicode;
13442 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013443 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013444}
13445
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013446static Py_UCS4
13447formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013448{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013449 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013450 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013451 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013452 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013453 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013454 goto onError;
13455 }
13456 else {
13457 /* Integer input truncated to a character */
13458 long x;
13459 x = PyLong_AsLong(v);
13460 if (x == -1 && PyErr_Occurred())
13461 goto onError;
13462
Victor Stinner8faf8212011-12-08 22:14:11 +010013463 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 PyErr_SetString(PyExc_OverflowError,
13465 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013466 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013467 }
13468
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013469 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013470 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013471
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013473 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013474 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013475 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013476}
13477
Alexander Belopolsky40018472011-02-26 01:02:56 +000013478PyObject *
13479PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013480{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013481 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013482 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013483 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013484 PyObject *temp = NULL;
13485 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013486 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013487 void *fmt;
13488 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013489 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013490 Py_ssize_t sublen;
13491 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013492
Guido van Rossumd57fd912000-03-10 22:53:23 +000013493 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013494 PyErr_BadInternalCall();
13495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013496 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013497 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013498 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013499 return NULL;
Victor Stinner19294072012-10-05 00:09:33 +020013500 if (PyUnicode_READY(uformat) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013501 Py_DECREF(uformat);
Victor Stinner19294072012-10-05 00:09:33 +020013502 return NULL;
13503 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013505 fmt = PyUnicode_DATA(uformat);
13506 fmtkind = PyUnicode_KIND(uformat);
13507 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13508 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013509
Victor Stinnerd3f08822012-05-29 12:57:52 +020013510 _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013511
Guido van Rossumd57fd912000-03-10 22:53:23 +000013512 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013513 arglen = PyTuple_Size(args);
13514 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013515 }
13516 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013517 arglen = -1;
13518 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013519 }
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013520 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013522
13523 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013524 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013525 Py_ssize_t nonfmtpos;
13526 nonfmtpos = fmtpos++;
13527 while (fmtcnt >= 0 &&
13528 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13529 fmtpos++;
13530 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013531 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013532 if (fmtcnt < 0)
13533 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013534 sublen = fmtpos - nonfmtpos;
13535 maxchar = _PyUnicode_FindMaxChar(uformat,
13536 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013537 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013538 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013539
Victor Stinnerd3f08822012-05-29 12:57:52 +020013540 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13541 uformat, nonfmtpos, sublen);
Victor Stinneree4544c2012-05-09 22:24:08 +020013542 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013543 }
13544 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 /* Got a format specifier */
13546 int flags = 0;
13547 Py_ssize_t width = -1;
13548 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013549 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013550 Py_UCS4 fill;
13551 int sign;
13552 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013553 int isnumok;
13554 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013555 void *pbuf = NULL;
13556 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013557 Py_UCS4 bufmaxchar;
13558 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013560 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013561 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13562 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013563 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013564 Py_ssize_t keylen;
13565 PyObject *key;
13566 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013567
Benjamin Peterson29060642009-01-31 22:14:21 +000013568 if (dict == NULL) {
13569 PyErr_SetString(PyExc_TypeError,
13570 "format requires a mapping");
13571 goto onError;
13572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013573 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013575 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013576 /* Skip over balanced parentheses */
13577 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013578 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13579 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013580 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013581 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013582 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013583 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013585 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013586 if (fmtcnt < 0 || pcount > 0) {
13587 PyErr_SetString(PyExc_ValueError,
13588 "incomplete format key");
13589 goto onError;
13590 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013591 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013592 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013593 if (key == NULL)
13594 goto onError;
13595 if (args_owned) {
13596 Py_DECREF(args);
13597 args_owned = 0;
13598 }
13599 args = PyObject_GetItem(dict, key);
13600 Py_DECREF(key);
13601 if (args == NULL) {
13602 goto onError;
13603 }
13604 args_owned = 1;
13605 arglen = -1;
13606 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013607 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013609 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13610 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013611 case '-': flags |= F_LJUST; continue;
13612 case '+': flags |= F_SIGN; continue;
13613 case ' ': flags |= F_BLANK; continue;
13614 case '#': flags |= F_ALT; continue;
13615 case '0': flags |= F_ZERO; continue;
13616 }
13617 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013618 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013619 if (c == '*') {
13620 v = getnextarg(args, arglen, &argidx);
13621 if (v == NULL)
13622 goto onError;
13623 if (!PyLong_Check(v)) {
13624 PyErr_SetString(PyExc_TypeError,
13625 "* wants int");
13626 goto onError;
13627 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013628 width = PyLong_AsSsize_t(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013629 if (width == -1 && PyErr_Occurred())
13630 goto onError;
13631 if (width < 0) {
13632 flags |= F_LJUST;
13633 width = -width;
13634 }
13635 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013636 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013637 }
13638 else if (c >= '0' && c <= '9') {
13639 width = c - '0';
13640 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013641 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013642 if (c < '0' || c > '9')
13643 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013644 /* Since c is unsigned, the RHS would end up as unsigned,
13645 mixing signed and unsigned comparison. Since c is between
13646 '0' and '9', casting to int is safe. */
13647 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013648 PyErr_SetString(PyExc_ValueError,
13649 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013650 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013651 }
13652 width = width*10 + (c - '0');
13653 }
13654 }
13655 if (c == '.') {
13656 prec = 0;
13657 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013658 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013659 if (c == '*') {
13660 v = getnextarg(args, arglen, &argidx);
13661 if (v == NULL)
13662 goto onError;
13663 if (!PyLong_Check(v)) {
13664 PyErr_SetString(PyExc_TypeError,
13665 "* wants int");
13666 goto onError;
13667 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013668 prec = _PyLong_AsInt(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013669 if (prec == -1 && PyErr_Occurred())
13670 goto onError;
13671 if (prec < 0)
13672 prec = 0;
13673 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013674 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013675 }
13676 else if (c >= '0' && c <= '9') {
13677 prec = c - '0';
13678 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013679 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013680 if (c < '0' || c > '9')
13681 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013682 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013683 PyErr_SetString(PyExc_ValueError,
13684 "prec too big");
13685 goto onError;
13686 }
13687 prec = prec*10 + (c - '0');
13688 }
13689 }
13690 } /* prec */
13691 if (fmtcnt >= 0) {
13692 if (c == 'h' || c == 'l' || c == 'L') {
13693 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013694 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013695 }
13696 }
13697 if (fmtcnt < 0) {
13698 PyErr_SetString(PyExc_ValueError,
13699 "incomplete format");
13700 goto onError;
13701 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013702 if (fmtcnt == 0)
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013703 writer.overallocate = 0;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013704
13705 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013706 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013707 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013708 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13709 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013710 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013711 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013712
Victor Stinneraff3cc62012-04-30 05:19:21 +020013713 v = getnextarg(args, arglen, &argidx);
13714 if (v == NULL)
13715 goto onError;
13716
Benjamin Peterson29060642009-01-31 22:14:21 +000013717 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013718 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013719 fill = ' ';
13720 switch (c) {
13721
Benjamin Peterson29060642009-01-31 22:14:21 +000013722 case 's':
13723 case 'r':
13724 case 'a':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013725 if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13726 /* Fast path */
13727 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13728 goto onError;
13729 goto nextarg;
13730 }
13731
Victor Stinner808fc0a2010-03-22 12:50:40 +000013732 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013733 temp = v;
13734 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013735 }
13736 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013737 if (c == 's')
13738 temp = PyObject_Str(v);
13739 else if (c == 'r')
13740 temp = PyObject_Repr(v);
13741 else
13742 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013743 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013744 break;
13745
13746 case 'i':
13747 case 'd':
13748 case 'u':
13749 case 'o':
13750 case 'x':
13751 case 'X':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013752 if (PyLong_CheckExact(v)
13753 && width == -1 && prec == -1
13754 && !(flags & (F_SIGN | F_BLANK)))
13755 {
13756 /* Fast path */
13757 switch(c)
13758 {
13759 case 'd':
13760 case 'i':
13761 case 'u':
13762 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13763 goto onError;
13764 goto nextarg;
13765 case 'x':
13766 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13767 goto onError;
13768 goto nextarg;
13769 case 'o':
13770 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13771 goto onError;
13772 goto nextarg;
13773 default:
13774 break;
13775 }
13776 }
13777
Benjamin Peterson29060642009-01-31 22:14:21 +000013778 isnumok = 0;
13779 if (PyNumber_Check(v)) {
13780 PyObject *iobj=NULL;
13781
13782 if (PyLong_Check(v)) {
13783 iobj = v;
13784 Py_INCREF(iobj);
13785 }
13786 else {
13787 iobj = PyNumber_Long(v);
13788 }
13789 if (iobj!=NULL) {
13790 if (PyLong_Check(iobj)) {
13791 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013792 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013793 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013794 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013795 }
13796 else {
13797 Py_DECREF(iobj);
13798 }
13799 }
13800 }
13801 if (!isnumok) {
13802 PyErr_Format(PyExc_TypeError,
13803 "%%%c format: a number is required, "
13804 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13805 goto onError;
13806 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013807 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013808 fill = '0';
13809 break;
13810
13811 case 'e':
13812 case 'E':
13813 case 'f':
13814 case 'F':
13815 case 'g':
13816 case 'G':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013817 if (width == -1 && prec == -1
13818 && !(flags & (F_SIGN | F_BLANK)))
13819 {
13820 /* Fast path */
13821 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13822 goto onError;
13823 goto nextarg;
13824 }
13825
Benjamin Peterson29060642009-01-31 22:14:21 +000013826 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013827 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013828 fill = '0';
Victor Stinnerd3f08822012-05-29 12:57:52 +020013829 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13830 temp = NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013831 break;
13832
13833 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013834 {
13835 Py_UCS4 ch = formatchar(v);
13836 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013837 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013838 if (width == -1 && prec == -1) {
13839 /* Fast path */
13840 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13841 goto onError;
13842 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13843 writer.pos += 1;
13844 goto nextarg;
13845 }
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013846 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013847 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013848 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013849
13850 default:
13851 PyErr_Format(PyExc_ValueError,
13852 "unsupported format character '%c' (0x%x) "
13853 "at index %zd",
13854 (31<=c && c<=126) ? (char)c : '?',
13855 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013856 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013857 goto onError;
13858 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013859 if (temp == NULL)
13860 goto onError;
13861 assert (PyUnicode_Check(temp));
Victor Stinnerd3f08822012-05-29 12:57:52 +020013862
13863 if (width == -1 && prec == -1
13864 && !(flags & (F_SIGN | F_BLANK)))
13865 {
13866 /* Fast path */
13867 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13868 goto onError;
13869 goto nextarg;
13870 }
13871
Victor Stinneraff3cc62012-04-30 05:19:21 +020013872 if (PyUnicode_READY(temp) == -1) {
13873 Py_CLEAR(temp);
13874 goto onError;
13875 }
13876 kind = PyUnicode_KIND(temp);
13877 pbuf = PyUnicode_DATA(temp);
13878 len = PyUnicode_GET_LENGTH(temp);
13879
13880 if (c == 's' || c == 'r' || c == 'a') {
13881 if (prec >= 0 && len > prec)
13882 len = prec;
13883 }
13884
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013885 /* pbuf is initialized here. */
13886 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013887 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013888 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13889 if (ch == '-' || ch == '+') {
13890 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013891 len--;
13892 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013893 }
13894 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013895 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000013896 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013897 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000013898 else
13899 sign = 0;
13900 }
13901 if (width < len)
13902 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013903
13904 /* Compute the length and maximum character of the
13905 written characters */
13906 bufmaxchar = 127;
13907 if (!(flags & F_LJUST)) {
13908 if (sign) {
13909 if ((width-1) > len)
Benjamin Peterson7e303732013-06-10 09:19:46 -070013910 bufmaxchar = Py_MAX(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013911 }
13912 else {
13913 if (width > len)
Benjamin Peterson7e303732013-06-10 09:19:46 -070013914 bufmaxchar = Py_MAX(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013915 }
13916 }
13917 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
Benjamin Peterson7e303732013-06-10 09:19:46 -070013918 bufmaxchar = Py_MAX(bufmaxchar, maxchar);
Victor Stinneree4544c2012-05-09 22:24:08 +020013919
13920 buflen = width;
13921 if (sign && len == width)
13922 buflen++;
13923
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013924 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020013925 goto onError;
13926
13927 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000013928 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013929 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020013930 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13931 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013932 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013933 if (width > len)
13934 width--;
13935 }
13936 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013937 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013938 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013939 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013940 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13941 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13942 writer.pos += 2;
13943 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000013944 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013945 width -= 2;
13946 if (width < 0)
13947 width = 0;
13948 len -= 2;
13949 }
13950 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013951 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013952 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13953 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013954 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013955 }
13956 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013957 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013958 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13959 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013960 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013961 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013962 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13963 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013964 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13965 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13966 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013967 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013968 }
13969 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013970
Victor Stinnerc9d369f2012-06-16 02:22:37 +020013971 if (len) {
13972 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13973 temp, pindex, len);
13974 writer.pos += len;
13975 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013976 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013977 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013978 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13979 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013980 }
Victor Stinneree4544c2012-05-09 22:24:08 +020013981
Victor Stinnerd3f08822012-05-29 12:57:52 +020013982nextarg:
Benjamin Peterson29060642009-01-31 22:14:21 +000013983 if (dict && (argidx < arglen) && c != '%') {
13984 PyErr_SetString(PyExc_TypeError,
13985 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013986 goto onError;
13987 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013988 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013989 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013990 } /* until end */
13991 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013992 PyErr_SetString(PyExc_TypeError,
13993 "not all arguments converted during string formatting");
13994 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013995 }
13996
13997 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013998 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013999 }
14000 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014001 Py_XDECREF(temp);
14002 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014003 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014004
Benjamin Peterson29060642009-01-31 22:14:21 +000014005 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014006 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014007 Py_XDECREF(temp);
14008 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014009 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014010 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014011 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014012 }
14013 return NULL;
14014}
14015
Jeremy Hylton938ace62002-07-17 16:30:39 +000014016static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014017unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14018
Tim Peters6d6c1a32001-08-02 04:15:00 +000014019static PyObject *
14020unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14021{
Benjamin Peterson29060642009-01-31 22:14:21 +000014022 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014023 static char *kwlist[] = {"object", "encoding", "errors", 0};
14024 char *encoding = NULL;
14025 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014026
Benjamin Peterson14339b62009-01-31 16:36:08 +000014027 if (type != &PyUnicode_Type)
14028 return unicode_subtype_new(type, args, kwds);
14029 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014030 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014031 return NULL;
14032 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014033 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014034 if (encoding == NULL && errors == NULL)
14035 return PyObject_Str(x);
14036 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014037 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014038}
14039
Guido van Rossume023fe02001-08-30 03:12:59 +000014040static PyObject *
14041unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14042{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014043 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014044 Py_ssize_t length, char_size;
14045 int share_wstr, share_utf8;
14046 unsigned int kind;
14047 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014048
Benjamin Peterson14339b62009-01-31 16:36:08 +000014049 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014050
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014051 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014052 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014053 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014054 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014055 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014056 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014057 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014058 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014059
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014060 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014061 if (self == NULL) {
14062 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014063 return NULL;
14064 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014065 kind = PyUnicode_KIND(unicode);
14066 length = PyUnicode_GET_LENGTH(unicode);
14067
14068 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014069#ifdef Py_DEBUG
14070 _PyUnicode_HASH(self) = -1;
14071#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014072 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014073#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014074 _PyUnicode_STATE(self).interned = 0;
14075 _PyUnicode_STATE(self).kind = kind;
14076 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014077 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014078 _PyUnicode_STATE(self).ready = 1;
14079 _PyUnicode_WSTR(self) = NULL;
14080 _PyUnicode_UTF8_LENGTH(self) = 0;
14081 _PyUnicode_UTF8(self) = NULL;
14082 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014083 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014084
14085 share_utf8 = 0;
14086 share_wstr = 0;
14087 if (kind == PyUnicode_1BYTE_KIND) {
14088 char_size = 1;
14089 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14090 share_utf8 = 1;
14091 }
14092 else if (kind == PyUnicode_2BYTE_KIND) {
14093 char_size = 2;
14094 if (sizeof(wchar_t) == 2)
14095 share_wstr = 1;
14096 }
14097 else {
14098 assert(kind == PyUnicode_4BYTE_KIND);
14099 char_size = 4;
14100 if (sizeof(wchar_t) == 4)
14101 share_wstr = 1;
14102 }
14103
14104 /* Ensure we won't overflow the length. */
14105 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14106 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014107 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014108 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014109 data = PyObject_MALLOC((length + 1) * char_size);
14110 if (data == NULL) {
14111 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014112 goto onError;
14113 }
14114
Victor Stinnerc3c74152011-10-02 20:39:55 +020014115 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014116 if (share_utf8) {
14117 _PyUnicode_UTF8_LENGTH(self) = length;
14118 _PyUnicode_UTF8(self) = data;
14119 }
14120 if (share_wstr) {
14121 _PyUnicode_WSTR_LENGTH(self) = length;
14122 _PyUnicode_WSTR(self) = (wchar_t *)data;
14123 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014124
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014125 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014126 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014127 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014128#ifdef Py_DEBUG
14129 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14130#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014131 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014132 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014133
14134onError:
14135 Py_DECREF(unicode);
14136 Py_DECREF(self);
14137 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014138}
14139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014140PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014141"str(object='') -> str\n\
14142str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014143\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014144Create a new string object from the given object. If encoding or\n\
14145errors is specified, then the object must expose a data buffer\n\
14146that will be decoded using the given encoding and error handler.\n\
14147Otherwise, returns the result of object.__str__() (if defined)\n\
14148or repr(object).\n\
14149encoding defaults to sys.getdefaultencoding().\n\
14150errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014151
/* Defined later in this file; needed here for the tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  The initializer is positional: every entry must
   stay in slot order, and the trailing comment names the slot it fills. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14197
14198/* Initialize the Unicode implementation */
14199
Victor Stinner3a50e702011-10-18 21:21:00 +020014200int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014201{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014202 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014203 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014204 0x000A, /* LINE FEED */
14205 0x000D, /* CARRIAGE RETURN */
14206 0x001C, /* FILE SEPARATOR */
14207 0x001D, /* GROUP SEPARATOR */
14208 0x001E, /* RECORD SEPARATOR */
14209 0x0085, /* NEXT LINE */
14210 0x2028, /* LINE SEPARATOR */
14211 0x2029, /* PARAGRAPH SEPARATOR */
14212 };
14213
Fred Drakee4315f52000-05-09 19:53:39 +000014214 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014215 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014216 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014217 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014218 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014219
Guido van Rossumcacfc072002-05-24 19:01:59 +000014220 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014221 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014222
14223 /* initialize the linebreak bloom filter */
14224 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014225 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014226 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014227
Christian Heimes26532f72013-07-20 14:57:16 +020014228 if (PyType_Ready(&EncodingMapType) < 0)
14229 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014230
Benjamin Petersonc4311282012-10-30 23:21:10 -040014231 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14232 Py_FatalError("Can't initialize field name iterator type");
14233
14234 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14235 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014236
Victor Stinner3a50e702011-10-18 21:21:00 +020014237#ifdef HAVE_MBCS
14238 winver.dwOSVersionInfoSize = sizeof(winver);
14239 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14240 PyErr_SetFromWindowsErr(0);
14241 return -1;
14242 }
14243#endif
14244 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014245}
14246
14247/* Finalize the Unicode implementation */
14248
/* Always reports 0: this implementation releases nothing when asked to
   clear a free list. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14254
Guido van Rossumd57fd912000-03-10 22:53:23 +000014255void
Thomas Wouters78890102000-07-22 19:25:51 +000014256_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014257{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014258 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014259
Serhiy Storchaka05997252013-01-26 12:14:02 +020014260 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014261
Serhiy Storchaka05997252013-01-26 12:14:02 +020014262 for (i = 0; i < 256; i++)
14263 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014264 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014265 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014266}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014267
Walter Dörwald16807132007-05-25 13:52:07 +000014268void
14269PyUnicode_InternInPlace(PyObject **p)
14270{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014271 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014272 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014273#ifdef Py_DEBUG
14274 assert(s != NULL);
14275 assert(_PyUnicode_CHECK(s));
14276#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014277 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014278 return;
14279#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014280 /* If it's a subclass, we don't really know what putting
14281 it in the interned dict might do. */
14282 if (!PyUnicode_CheckExact(s))
14283 return;
14284 if (PyUnicode_CHECK_INTERNED(s))
14285 return;
14286 if (interned == NULL) {
14287 interned = PyDict_New();
14288 if (interned == NULL) {
14289 PyErr_Clear(); /* Don't leave an exception */
14290 return;
14291 }
14292 }
14293 /* It might be that the GetItem call fails even
14294 though the key is present in the dictionary,
14295 namely when this happens during a stack overflow. */
14296 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014297 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014298 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014299
Benjamin Peterson29060642009-01-31 22:14:21 +000014300 if (t) {
14301 Py_INCREF(t);
14302 Py_DECREF(*p);
14303 *p = t;
14304 return;
14305 }
Walter Dörwald16807132007-05-25 13:52:07 +000014306
Benjamin Peterson14339b62009-01-31 16:36:08 +000014307 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014308 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014309 PyErr_Clear();
14310 PyThreadState_GET()->recursion_critical = 0;
14311 return;
14312 }
14313 PyThreadState_GET()->recursion_critical = 0;
14314 /* The two references in interned are not counted by refcnt.
14315 The deallocator will take care of this */
14316 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014317 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014318}
14319
14320void
14321PyUnicode_InternImmortal(PyObject **p)
14322{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014323 PyUnicode_InternInPlace(p);
14324 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014325 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014326 Py_INCREF(*p);
14327 }
Walter Dörwald16807132007-05-25 13:52:07 +000014328}
14329
14330PyObject *
14331PyUnicode_InternFromString(const char *cp)
14332{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014333 PyObject *s = PyUnicode_FromString(cp);
14334 if (s == NULL)
14335 return NULL;
14336 PyUnicode_InternInPlace(&s);
14337 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014338}
14339
Alexander Belopolsky40018472011-02-26 01:02:56 +000014340void
14341_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014342{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014343 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014344 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014345 Py_ssize_t i, n;
14346 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014347
Benjamin Peterson14339b62009-01-31 16:36:08 +000014348 if (interned == NULL || !PyDict_Check(interned))
14349 return;
14350 keys = PyDict_Keys(interned);
14351 if (keys == NULL || !PyList_Check(keys)) {
14352 PyErr_Clear();
14353 return;
14354 }
Walter Dörwald16807132007-05-25 13:52:07 +000014355
Benjamin Peterson14339b62009-01-31 16:36:08 +000014356 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14357 detector, interned unicode strings are not forcibly deallocated;
14358 rather, we give them their stolen references back, and then clear
14359 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014360
Benjamin Peterson14339b62009-01-31 16:36:08 +000014361 n = PyList_GET_SIZE(keys);
14362 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014363 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014364 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014365 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014366 if (PyUnicode_READY(s) == -1) {
14367 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014368 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014370 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014371 case SSTATE_NOT_INTERNED:
14372 /* XXX Shouldn't happen */
14373 break;
14374 case SSTATE_INTERNED_IMMORTAL:
14375 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014376 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014377 break;
14378 case SSTATE_INTERNED_MORTAL:
14379 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014380 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014381 break;
14382 default:
14383 Py_FatalError("Inconsistent interned string state.");
14384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014385 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014386 }
14387 fprintf(stderr, "total size of all interned strings: "
14388 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14389 "mortal/immortal\n", mortal_size, immortal_size);
14390 Py_DECREF(keys);
14391 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014392 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014393}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014394
14395
14396/********************* Unicode Iterator **************************/
14397
14398typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014399 PyObject_HEAD
14400 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014401 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014402} unicodeiterobject;
14403
14404static void
14405unicodeiter_dealloc(unicodeiterobject *it)
14406{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014407 _PyObject_GC_UNTRACK(it);
14408 Py_XDECREF(it->it_seq);
14409 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014410}
14411
14412static int
14413unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14414{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014415 Py_VISIT(it->it_seq);
14416 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014417}
14418
14419static PyObject *
14420unicodeiter_next(unicodeiterobject *it)
14421{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014422 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014423
Benjamin Peterson14339b62009-01-31 16:36:08 +000014424 assert(it != NULL);
14425 seq = it->it_seq;
14426 if (seq == NULL)
14427 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014428 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014430 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14431 int kind = PyUnicode_KIND(seq);
14432 void *data = PyUnicode_DATA(seq);
14433 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14434 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014435 if (item != NULL)
14436 ++it->it_index;
14437 return item;
14438 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014439
Benjamin Peterson14339b62009-01-31 16:36:08 +000014440 Py_DECREF(seq);
14441 it->it_seq = NULL;
14442 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014443}
14444
14445static PyObject *
14446unicodeiter_len(unicodeiterobject *it)
14447{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014448 Py_ssize_t len = 0;
14449 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014450 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014451 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014452}
14453
14454PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14455
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014456static PyObject *
14457unicodeiter_reduce(unicodeiterobject *it)
14458{
14459 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014460 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014461 it->it_seq, it->it_index);
14462 } else {
14463 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14464 if (u == NULL)
14465 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014466 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014467 }
14468}
14469
14470PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14471
14472static PyObject *
14473unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14474{
14475 Py_ssize_t index = PyLong_AsSsize_t(state);
14476 if (index == -1 && PyErr_Occurred())
14477 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000014478 if (it->it_seq != NULL) {
14479 if (index < 0)
14480 index = 0;
14481 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
14482 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
14483 it->it_index = index;
14484 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014485 Py_RETURN_NONE;
14486}
14487
14488PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14489
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014490static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014491 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014492 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014493 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14494 reduce_doc},
14495 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14496 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014497 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014498};
14499
14500PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014501 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14502 "str_iterator", /* tp_name */
14503 sizeof(unicodeiterobject), /* tp_basicsize */
14504 0, /* tp_itemsize */
14505 /* methods */
14506 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14507 0, /* tp_print */
14508 0, /* tp_getattr */
14509 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014510 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014511 0, /* tp_repr */
14512 0, /* tp_as_number */
14513 0, /* tp_as_sequence */
14514 0, /* tp_as_mapping */
14515 0, /* tp_hash */
14516 0, /* tp_call */
14517 0, /* tp_str */
14518 PyObject_GenericGetAttr, /* tp_getattro */
14519 0, /* tp_setattro */
14520 0, /* tp_as_buffer */
14521 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14522 0, /* tp_doc */
14523 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14524 0, /* tp_clear */
14525 0, /* tp_richcompare */
14526 0, /* tp_weaklistoffset */
14527 PyObject_SelfIter, /* tp_iter */
14528 (iternextfunc)unicodeiter_next, /* tp_iternext */
14529 unicodeiter_methods, /* tp_methods */
14530 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014531};
14532
14533static PyObject *
14534unicode_iter(PyObject *seq)
14535{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014536 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014537
Benjamin Peterson14339b62009-01-31 16:36:08 +000014538 if (!PyUnicode_Check(seq)) {
14539 PyErr_BadInternalCall();
14540 return NULL;
14541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014542 if (PyUnicode_READY(seq) == -1)
14543 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014544 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14545 if (it == NULL)
14546 return NULL;
14547 it->it_index = 0;
14548 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014549 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014550 _PyObject_GC_TRACK(it);
14551 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014552}
14553
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014554
14555size_t
14556Py_UNICODE_strlen(const Py_UNICODE *u)
14557{
14558 int res = 0;
14559 while(*u++)
14560 res++;
14561 return res;
14562}
14563
14564Py_UNICODE*
14565Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14566{
14567 Py_UNICODE *u = s1;
14568 while ((*u++ = *s2++));
14569 return s1;
14570}
14571
14572Py_UNICODE*
14573Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14574{
14575 Py_UNICODE *u = s1;
14576 while ((*u++ = *s2++))
14577 if (n-- == 0)
14578 break;
14579 return s1;
14580}
14581
14582Py_UNICODE*
14583Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14584{
14585 Py_UNICODE *u1 = s1;
14586 u1 += Py_UNICODE_strlen(u1);
14587 Py_UNICODE_strcpy(u1, s2);
14588 return s1;
14589}
14590
14591int
14592Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14593{
14594 while (*s1 && *s2 && *s1 == *s2)
14595 s1++, s2++;
14596 if (*s1 && *s2)
14597 return (*s1 < *s2) ? -1 : +1;
14598 if (*s1)
14599 return 1;
14600 if (*s2)
14601 return -1;
14602 return 0;
14603}
14604
14605int
14606Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14607{
14608 register Py_UNICODE u1, u2;
14609 for (; n != 0; n--) {
14610 u1 = *s1;
14611 u2 = *s2;
14612 if (u1 != u2)
14613 return (u1 < u2) ? -1 : +1;
14614 if (u1 == '\0')
14615 return 0;
14616 s1++;
14617 s2++;
14618 }
14619 return 0;
14620}
14621
14622Py_UNICODE*
14623Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14624{
14625 const Py_UNICODE *p;
14626 for (p = s; *p; p++)
14627 if (*p == c)
14628 return (Py_UNICODE*)p;
14629 return NULL;
14630}
14631
14632Py_UNICODE*
14633Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14634{
14635 const Py_UNICODE *p;
14636 p = s + Py_UNICODE_strlen(s);
14637 while (p != s) {
14638 p--;
14639 if (*p == c)
14640 return (Py_UNICODE*)p;
14641 }
14642 return NULL;
14643}
Victor Stinner331ea922010-08-10 16:37:20 +000014644
Victor Stinner71133ff2010-09-01 23:43:53 +000014645Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014646PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014647{
Victor Stinner577db2c2011-10-11 22:12:48 +020014648 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014649 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014651 if (!PyUnicode_Check(unicode)) {
14652 PyErr_BadArgument();
14653 return NULL;
14654 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014655 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014656 if (u == NULL)
14657 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014658 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014659 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014660 PyErr_NoMemory();
14661 return NULL;
14662 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014663 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014664 size *= sizeof(Py_UNICODE);
14665 copy = PyMem_Malloc(size);
14666 if (copy == NULL) {
14667 PyErr_NoMemory();
14668 return NULL;
14669 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014670 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014671 return copy;
14672}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014673
Georg Brandl66c221e2010-10-14 07:04:07 +000014674/* A _string module, to export formatter_parser and formatter_field_name_split
14675 to the string.Formatter class implemented in Python. */
14676
14677static PyMethodDef _string_methods[] = {
14678 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14679 METH_O, PyDoc_STR("split the argument as a field name")},
14680 {"formatter_parser", (PyCFunction) formatter_parser,
14681 METH_O, PyDoc_STR("parse the argument as a format string")},
14682 {NULL, NULL}
14683};
14684
14685static struct PyModuleDef _string_module = {
14686 PyModuleDef_HEAD_INIT,
14687 "_string",
14688 PyDoc_STR("string helper module"),
14689 0,
14690 _string_methods,
14691 NULL,
14692 NULL,
14693 NULL,
14694 NULL
14695};
14696
14697PyMODINIT_FUNC
14698PyInit__string(void)
14699{
14700 return PyModule_Create(&_string_module);
14701}
14702
14703
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014704#ifdef __cplusplus
14705}
14706#endif