blob: e1ff999e136980c23c6a9cc723d2f0f5278c2141 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order selection: big endian when the build configuration defines
   WORDS_BIGENDIAN, little endian otherwise. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000065
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066
67#ifdef __cplusplus
68extern "C" {
69#endif
70
/* Maximum code point of Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Check applied by the assert()s in the accessor macros of this file:
   the structural consistency check (without content scan) in debug
   builds, a plain type check otherwise. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020079
/* --- Field accessors ---------------------------------------------------
   The macros without assert() expand directly to a struct member and
   perform no checking of their argument. */

/* utf8 member of the PyCompactUnicodeObject layout (unchecked). */
#define _PyUnicode_UTF8(op)                                 \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII objects, the memory
   immediately after the PyASCIIObject header; otherwise the utf8 member. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((char*)((PyASCIIObject*)(op) + 1)) :              \
         _PyUnicode_UTF8(op))

/* utf8_length member of the PyCompactUnicodeObject layout (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op)                          \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: for compact ASCII objects, the length
   member of the PyASCIIObject header; otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((PyASCIIObject*)(op))->length :                   \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr / wstr_length members (unchecked). */
#define _PyUnicode_WSTR(op)                                 \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                          \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length / state / hash members of the PyASCIIObject header (unchecked). */
#define _PyUnicode_LENGTH(op)                               \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                                \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                                 \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, with the argument validated by _PyUnicode_CHECK. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* data.any member of the PyUnicodeObject layout (unchecked). */
#define _PyUnicode_DATA_ANY(op)                             \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200114
/* File-local replacement for PyUnicode_READY(): validates op with
   _PyUnicode_CHECK, then evaluates to 0 if op is already ready and to the
   result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200121
/* True if the utf8 pointer of op aliases its character data.  Must not be
   used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),               \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op aliases its character data.
   The macro refers only to its own argument; it previously expanded
   _PyUnicode_WSTR(unicode) and therefore silently depended on the caller
   having a variable named "unicode" in scope. */
#define _PyUnicode_SHARE_WSTR(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
129
/* True if op has a UTF-8 buffer of its own: op is not compact ASCII, the
   utf8 pointer is set, and it does not alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_COMPACT_ASCII(op)                       \
      && _PyUnicode_UTF8(op)                                \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if op has a wstr buffer of its own: the wstr pointer is set and
   either op is not ready yet or wstr does not alias the character data.
   The readiness test comes first so that PyUnicode_DATA() is evaluated
   only for ready objects. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) &&                                \
      (!PyUnicode_IS_READY(op) ||                           \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
145
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *".
   to points to the buffer that receives the converted characters; it is
   cast to "to_type *".

   Every argument is evaluated exactly once.  The "to" argument is
   parenthesized before the cast so that an expression such as "buf + off"
   is converted as a whole rather than having the cast bind to "buf" only.

   The bulk of the input is copied four elements per iteration; the
   remaining 0-3 elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200169
Walter Dörwald16807132007-05-25 13:52:07 +0000170/* This dictionary holds all interned unicode strings. Note that references
171 to strings in this dictionary are *not* counted in the string's ob_refcnt.
172 When the interned string reaches a refcnt of 0 the string deallocation
173 function will delete the reference from this dictionary.
174
175 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000176 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000177*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200178static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200181static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182
Serhiy Storchaka678db842013-01-26 12:16:36 +0200183#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200184 do { \
185 if (unicode_empty != NULL) \
186 Py_INCREF(unicode_empty); \
187 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200188 unicode_empty = PyUnicode_New(0, 0); \
189 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200190 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200191 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
192 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200193 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000195
Serhiy Storchaka678db842013-01-26 12:16:36 +0200196#define _Py_RETURN_UNICODE_EMPTY() \
197 do { \
198 _Py_INCREF_UNICODE_EMPTY(); \
199 return unicode_empty; \
200 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200202/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200203static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200204
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205/* Single character Unicode strings in the Latin-1 range are being
206 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code; an entry is 1 for:
     0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     0x0C FORM FEED, 0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
     0x1F UNIT SEPARATOR, 0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 .. 0x0D are whitespace */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C .. 0x1F are whitespace */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
239
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200240/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200241static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200242static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100243static int unicode_modifiable(PyObject *unicode);
244
Victor Stinnerfe226c02011-10-03 03:52:20 +0200245
Alexander Belopolsky40018472011-02-26 01:02:56 +0000246static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100247_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200248static PyObject *
249_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
250static PyObject *
251_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
252
253static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000255 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100256 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000257 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
258
Alexander Belopolsky40018472011-02-26 01:02:56 +0000259static void
260raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300261 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100262 PyObject *unicode,
263 Py_ssize_t startpos, Py_ssize_t endpos,
264 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000265
/* Fast detection of ASCII line breaks (same idea as the whitespace table).
   128-entry table indexed by ASCII code; an entry is 1 for:
     0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
     0x0D CARRIAGE RETURN,
     0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR
   and 0 for everything else.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A .. 0x0D are line breaks */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C .. 0x1E are line breaks */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
293
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300294/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
295 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000297PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000299#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000300 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000301#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000302 /* This is actually an illegal character, so it should
303 not be passed to unichr. */
304 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000305#endif
306}
307
#ifdef Py_DEBUG
/* Debug-build check of the internal invariants of a unicode object.

   op            -- object to check; must satisfy PyUnicode_Check().
   check_content -- if non-zero (and the string is not in the legacy
                    wchar_t-only form), also scan the characters to verify
                    that the narrowest possible kind is used and that the
                    buffer is null-terminated.

   Every violation triggers an assert(); the function itself always
   returns 1 so that it can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1-byte kind and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* not compact: characters live behind data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is populated */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an absent cache must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *chars;
        Py_UCS4 ch;

        chars = PyUnicode_DATA(ascii);
        for (pos = 0; pos < ascii->length; pos++)
        {
            ch = PyUnicode_READ(kind, chars, pos);
            if (ch > maxchar)
                maxchar = ch;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }

        /* the character after the last one must be a null terminator */
        assert(PyUnicode_READ(kind, chars, ascii->length) == 0);
    }
    return 1;
}
#endif
423
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100424static PyObject*
425unicode_result_wchar(PyObject *unicode)
426{
427#ifndef Py_DEBUG
428 Py_ssize_t len;
429
430 assert(Py_REFCNT(unicode) == 1);
431
432 len = _PyUnicode_WSTR_LENGTH(unicode);
433 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100434 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200435 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100436 }
437
438 if (len == 1) {
439 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100440 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
442 Py_DECREF(unicode);
443 return latin1_char;
444 }
445 }
446
447 if (_PyUnicode_Ready(unicode) < 0) {
448 Py_XDECREF(unicode);
449 return NULL;
450 }
451#else
452 /* don't make the result ready in debug mode to ensure that the caller
453 makes the string ready before using it */
454 assert(_PyUnicode_CheckConsistency(unicode, 1));
455#endif
456 return unicode;
457}
458
459static PyObject*
460unicode_result_ready(PyObject *unicode)
461{
462 Py_ssize_t length;
463
464 length = PyUnicode_GET_LENGTH(unicode);
465 if (length == 0) {
466 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100467 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200468 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100469 }
470 return unicode_empty;
471 }
472
473 if (length == 1) {
474 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
475 if (ch < 256) {
476 PyObject *latin1_char = unicode_latin1[ch];
477 if (latin1_char != NULL) {
478 if (unicode != latin1_char) {
479 Py_INCREF(latin1_char);
480 Py_DECREF(unicode);
481 }
482 return latin1_char;
483 }
484 else {
485 assert(_PyUnicode_CheckConsistency(unicode, 1));
486 Py_INCREF(unicode);
487 unicode_latin1[ch] = unicode;
488 return unicode;
489 }
490 }
491 }
492
493 assert(_PyUnicode_CheckConsistency(unicode, 1));
494 return unicode;
495}
496
497static PyObject*
498unicode_result(PyObject *unicode)
499{
500 assert(_PyUnicode_CHECK(unicode));
501 if (PyUnicode_IS_READY(unicode))
502 return unicode_result_ready(unicode);
503 else
504 return unicode_result_wchar(unicode);
505}
506
Victor Stinnerc4b49542011-12-11 22:44:26 +0100507static PyObject*
508unicode_result_unchanged(PyObject *unicode)
509{
510 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500511 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100512 return NULL;
513 Py_INCREF(unicode);
514 return unicode;
515 }
516 else
517 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100518 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100519}
520
#if defined(HAVE_MBCS)
/* Windows version information; only present when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
524
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525/* --- Bloom Filters ----------------------------------------------------- */
526
527/* stuff to implement simple "bloom filters" for Unicode characters.
528 to keep things simple, we use a single bitmask, using the least 5
529 bits from each unicode characters as the bit index. */
530
531/* the linebreak mask is set up by Unicode_Init below */
532
Antoine Pitrouf068f942010-01-13 14:19:12 +0000533#if LONG_BIT >= 128
534#define BLOOM_WIDTH 128
535#elif LONG_BIT >= 64
536#define BLOOM_WIDTH 64
537#elif LONG_BIT >= 32
538#define BLOOM_WIDTH 32
539#else
540#error "LONG_BIT is smaller than 32"
541#endif
542
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543#define BLOOM_MASK unsigned long
544
Serhiy Storchaka05997252013-01-26 12:14:02 +0200545static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546
Antoine Pitrouf068f942010-01-13 14:19:12 +0000547#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
548#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000549
Benjamin Peterson29060642009-01-31 22:14:21 +0000550#define BLOOM_LINEBREAK(ch) \
551 ((ch) < 128U ? ascii_linebreak[(ch)] : \
552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
557 /* calculate simple bloom-style bitmask for a given unicode string */
558
Antoine Pitrouf068f942010-01-13 14:19:12 +0000559 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000560 Py_ssize_t i;
561
562 mask = 0;
563 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200564 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000565
566 return mask;
567}
568
/* True if chr occurs in str: the bloom mask is tested first, and
   PyUnicode_FindChar() is only called when the mask reports a possible
   hit. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200573/* Compilation of templated routines */
574
575#include "stringlib/asciilib.h"
576#include "stringlib/fastsearch.h"
577#include "stringlib/partition.h"
578#include "stringlib/split.h"
579#include "stringlib/count.h"
580#include "stringlib/find.h"
581#include "stringlib/find_max_char.h"
582#include "stringlib/localeutil.h"
583#include "stringlib/undef.h"
584
585#include "stringlib/ucs1lib.h"
586#include "stringlib/fastsearch.h"
587#include "stringlib/partition.h"
588#include "stringlib/split.h"
589#include "stringlib/count.h"
590#include "stringlib/find.h"
591#include "stringlib/find_max_char.h"
592#include "stringlib/localeutil.h"
593#include "stringlib/undef.h"
594
595#include "stringlib/ucs2lib.h"
596#include "stringlib/fastsearch.h"
597#include "stringlib/partition.h"
598#include "stringlib/split.h"
599#include "stringlib/count.h"
600#include "stringlib/find.h"
601#include "stringlib/find_max_char.h"
602#include "stringlib/localeutil.h"
603#include "stringlib/undef.h"
604
605#include "stringlib/ucs4lib.h"
606#include "stringlib/fastsearch.h"
607#include "stringlib/partition.h"
608#include "stringlib/split.h"
609#include "stringlib/count.h"
610#include "stringlib/find.h"
611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200615#include "stringlib/unicodedefs.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100619#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200620
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621/* --- Unicode Object ----------------------------------------------------- */
622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200624fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200625
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200626Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
627 Py_ssize_t size, Py_UCS4 ch,
628 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200629{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200630 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
631
632 switch (kind) {
633 case PyUnicode_1BYTE_KIND:
634 {
635 Py_UCS1 ch1 = (Py_UCS1) ch;
636 if (ch1 == ch)
637 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
638 else
639 return -1;
640 }
641 case PyUnicode_2BYTE_KIND:
642 {
643 Py_UCS2 ch2 = (Py_UCS2) ch;
644 if (ch2 == ch)
645 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
646 else
647 return -1;
648 }
649 case PyUnicode_4BYTE_KIND:
650 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
651 default:
652 assert(0);
653 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655}
656
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657static PyObject*
658resize_compact(PyObject *unicode, Py_ssize_t length)
659{
660 Py_ssize_t char_size;
661 Py_ssize_t struct_size;
662 Py_ssize_t new_size;
663 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100664 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200665 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200666 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100667 assert(PyUnicode_IS_COMPACT(unicode));
668
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200669 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100670 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200671 struct_size = sizeof(PyASCIIObject);
672 else
673 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200674 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
677 PyErr_NoMemory();
678 return NULL;
679 }
680 new_size = (struct_size + (length + 1) * char_size);
681
Victor Stinner84def372011-12-11 20:04:56 +0100682 _Py_DEC_REFTOTAL;
683 _Py_ForgetReference(unicode);
684
685 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
686 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100687 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200688 PyErr_NoMemory();
689 return NULL;
690 }
Victor Stinner84def372011-12-11 20:04:56 +0100691 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100693
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200695 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100697 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200698 _PyUnicode_WSTR_LENGTH(unicode) = length;
699 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100700 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
701 PyObject_DEL(_PyUnicode_WSTR(unicode));
702 _PyUnicode_WSTR(unicode) = NULL;
703 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
705 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200706 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200707 return unicode;
708}
709
Alexander Belopolsky40018472011-02-26 01:02:56 +0000710static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200711resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712{
Victor Stinner95663112011-10-04 01:03:50 +0200713 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100714 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 if (PyUnicode_IS_READY(unicode)) {
719 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200720 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 void *data;
722
723 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200724 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200725 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
726 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200727
728 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
729 PyErr_NoMemory();
730 return -1;
731 }
732 new_size = (length + 1) * char_size;
733
Victor Stinner7a9105a2011-12-12 00:13:42 +0100734 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
735 {
736 PyObject_DEL(_PyUnicode_UTF8(unicode));
737 _PyUnicode_UTF8(unicode) = NULL;
738 _PyUnicode_UTF8_LENGTH(unicode) = 0;
739 }
740
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 data = (PyObject *)PyObject_REALLOC(data, new_size);
742 if (data == NULL) {
743 PyErr_NoMemory();
744 return -1;
745 }
746 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200747 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200748 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200749 _PyUnicode_WSTR_LENGTH(unicode) = length;
750 }
751 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200752 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200753 _PyUnicode_UTF8_LENGTH(unicode) = length;
754 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 _PyUnicode_LENGTH(unicode) = length;
756 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200757 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200758 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 }
Victor Stinner95663112011-10-04 01:03:50 +0200762 assert(_PyUnicode_WSTR(unicode) != NULL);
763
764 /* check for integer overflow */
765 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
766 PyErr_NoMemory();
767 return -1;
768 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100769 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200770 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100771 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200772 if (!wstr) {
773 PyErr_NoMemory();
774 return -1;
775 }
776 _PyUnicode_WSTR(unicode) = wstr;
777 _PyUnicode_WSTR(unicode)[length] = 0;
778 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200779 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000780 return 0;
781}
782
Victor Stinnerfe226c02011-10-03 03:52:20 +0200783static PyObject*
784resize_copy(PyObject *unicode, Py_ssize_t length)
785{
786 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100787 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200788 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100789
Benjamin Petersonbac79492012-01-14 13:34:47 -0500790 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100791 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792
793 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
794 if (copy == NULL)
795 return NULL;
796
797 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200798 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200800 }
801 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200802 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100803
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200804 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200805 if (w == NULL)
806 return NULL;
807 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
808 copy_length = Py_MIN(copy_length, length);
809 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
810 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200811 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200812 }
813}
814
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000816 Ux0000 terminated; some code (e.g. new_identifier)
817 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000818
819 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000820 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821
822*/
823
Alexander Belopolsky40018472011-02-26 01:02:56 +0000824static PyUnicodeObject *
825_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000826{
827 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829
Thomas Wouters477c8d52006-05-27 19:21:47 +0000830 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000831 if (length == 0 && unicode_empty != NULL) {
832 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200833 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 }
835
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000836 /* Ensure we won't overflow the size. */
837 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
838 return (PyUnicodeObject *)PyErr_NoMemory();
839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840 if (length < 0) {
841 PyErr_SetString(PyExc_SystemError,
842 "Negative size passed to _PyUnicode_New");
843 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000844 }
845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
847 if (unicode == NULL)
848 return NULL;
849 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
850 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
851 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100852 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000853 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100854 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200856
Jeremy Hyltond8082792003-09-16 19:41:39 +0000857 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000858 * the caller fails before initializing str -- unicode_resize()
859 * reads str[0], and the Keep-Alive optimization can keep memory
860 * allocated for str alive across a call to unicode_dealloc(unicode).
861 * We don't want unicode_resize to read uninitialized memory in
862 * that case.
863 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200864 _PyUnicode_WSTR(unicode)[0] = 0;
865 _PyUnicode_WSTR(unicode)[length] = 0;
866 _PyUnicode_WSTR_LENGTH(unicode) = length;
867 _PyUnicode_HASH(unicode) = -1;
868 _PyUnicode_STATE(unicode).interned = 0;
869 _PyUnicode_STATE(unicode).kind = 0;
870 _PyUnicode_STATE(unicode).compact = 0;
871 _PyUnicode_STATE(unicode).ready = 0;
872 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200873 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200875 _PyUnicode_UTF8(unicode) = NULL;
876 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100877 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878 return unicode;
879}
880
Victor Stinnerf42dc442011-10-02 23:33:16 +0200881static const char*
882unicode_kind_name(PyObject *unicode)
883{
Victor Stinner42dfd712011-10-03 14:41:45 +0200884 /* don't check consistency: unicode_kind_name() is called from
885 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200886 if (!PyUnicode_IS_COMPACT(unicode))
887 {
888 if (!PyUnicode_IS_READY(unicode))
889 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600890 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200891 {
892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "legacy ascii";
895 else
896 return "legacy latin1";
897 case PyUnicode_2BYTE_KIND:
898 return "legacy UCS2";
899 case PyUnicode_4BYTE_KIND:
900 return "legacy UCS4";
901 default:
902 return "<legacy invalid kind>";
903 }
904 }
905 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600906 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200907 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200908 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200909 return "ascii";
910 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200911 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200912 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200913 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200914 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200915 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200916 default:
917 return "<invalid compact kind>";
918 }
919}
920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922/* Functions wrapping macros for use in debugger */
/* Debugger helper: callable form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);

    return utf8;
}
926
/* Debugger helper: callable form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);

    return data;
}
930void *_PyUnicode_data(void *unicode){
931 printf("obj %p\n", unicode);
932 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
933 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
934 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
935 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
936 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
937 return PyUnicode_DATA(unicode);
938}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200939
940void
941_PyUnicode_Dump(PyObject *op)
942{
943 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200944 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
945 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
946 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200947
Victor Stinnera849a4b2011-10-03 12:12:11 +0200948 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200949 {
950 if (ascii->state.ascii)
951 data = (ascii + 1);
952 else
953 data = (compact + 1);
954 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200955 else
956 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200957 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
958
Victor Stinnera849a4b2011-10-03 12:12:11 +0200959 if (ascii->wstr == data)
960 printf("shared ");
961 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200962
Victor Stinnera3b334d2011-10-03 13:53:37 +0200963 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200964 printf(" (%zu), ", compact->wstr_length);
965 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
966 printf("shared ");
967 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200968 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200969 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200970}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971#endif
972
973PyObject *
974PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
975{
976 PyObject *obj;
977 PyCompactUnicodeObject *unicode;
978 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200979 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200980 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981 Py_ssize_t char_size;
982 Py_ssize_t struct_size;
983
984 /* Optimization for empty strings */
985 if (size == 0 && unicode_empty != NULL) {
986 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200987 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 }
989
Victor Stinner9e9d6892011-10-04 01:02:02 +0200990 is_ascii = 0;
991 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992 struct_size = sizeof(PyCompactUnicodeObject);
993 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200994 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995 char_size = 1;
996 is_ascii = 1;
997 struct_size = sizeof(PyASCIIObject);
998 }
999 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001000 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001 char_size = 1;
1002 }
1003 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001004 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001005 char_size = 2;
1006 if (sizeof(wchar_t) == 2)
1007 is_sharing = 1;
1008 }
1009 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001010 if (maxchar > MAX_UNICODE) {
1011 PyErr_SetString(PyExc_SystemError,
1012 "invalid maximum character passed to PyUnicode_New");
1013 return NULL;
1014 }
Victor Stinner8f825062012-04-27 13:55:39 +02001015 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001016 char_size = 4;
1017 if (sizeof(wchar_t) == 4)
1018 is_sharing = 1;
1019 }
1020
1021 /* Ensure we won't overflow the size. */
1022 if (size < 0) {
1023 PyErr_SetString(PyExc_SystemError,
1024 "Negative size passed to PyUnicode_New");
1025 return NULL;
1026 }
1027 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1028 return PyErr_NoMemory();
1029
1030 /* Duplicated allocation code from _PyObject_New() instead of a call to
1031 * PyObject_New() so we are able to allocate space for the object and
1032 * it's data buffer.
1033 */
1034 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1035 if (obj == NULL)
1036 return PyErr_NoMemory();
1037 obj = PyObject_INIT(obj, &PyUnicode_Type);
1038 if (obj == NULL)
1039 return NULL;
1040
1041 unicode = (PyCompactUnicodeObject *)obj;
1042 if (is_ascii)
1043 data = ((PyASCIIObject*)obj) + 1;
1044 else
1045 data = unicode + 1;
1046 _PyUnicode_LENGTH(unicode) = size;
1047 _PyUnicode_HASH(unicode) = -1;
1048 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001049 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 _PyUnicode_STATE(unicode).compact = 1;
1051 _PyUnicode_STATE(unicode).ready = 1;
1052 _PyUnicode_STATE(unicode).ascii = is_ascii;
1053 if (is_ascii) {
1054 ((char*)data)[size] = 0;
1055 _PyUnicode_WSTR(unicode) = NULL;
1056 }
Victor Stinner8f825062012-04-27 13:55:39 +02001057 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 ((char*)data)[size] = 0;
1059 _PyUnicode_WSTR(unicode) = NULL;
1060 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001062 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 else {
1065 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001066 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001067 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 ((Py_UCS4*)data)[size] = 0;
1071 if (is_sharing) {
1072 _PyUnicode_WSTR_LENGTH(unicode) = size;
1073 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1074 }
1075 else {
1076 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1077 _PyUnicode_WSTR(unicode) = NULL;
1078 }
1079 }
Victor Stinner8f825062012-04-27 13:55:39 +02001080#ifdef Py_DEBUG
1081 /* Fill the data with invalid characters to detect bugs earlier.
1082 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1083 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1084 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1085 memset(data, 0xff, size * kind);
1086#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001087 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 return obj;
1089}
1090
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store it in
   the 4-byte data of `unicode`.  A high surrogate immediately followed by
   a low surrogate is joined into a single code point; every other unit,
   lone surrogates included, is copied as is.  The other conversions are
   implemented as macros for efficiency.

   `unicode` must be a 4-byte kind string whose length equals the number of
   code points produced (asserted); the caller is expected to have room
   for the terminating null character as well. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* Exactly the announced number of code points must have been written. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1130
Victor Stinnercd9950f2011-10-02 00:34:53 +02001131static int
Victor Stinner488fa492011-12-12 00:01:39 +01001132unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133{
Victor Stinner488fa492011-12-12 00:01:39 +01001134 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001135 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001136 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001137 return -1;
1138 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return 0;
1140}
1141
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001142static int
1143_copy_characters(PyObject *to, Py_ssize_t to_start,
1144 PyObject *from, Py_ssize_t from_start,
1145 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001147 unsigned int from_kind, to_kind;
1148 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149
Victor Stinneree4544c2012-05-09 22:24:08 +02001150 assert(0 <= how_many);
1151 assert(0 <= from_start);
1152 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001153 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001154 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001155 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156
Victor Stinnerd3f08822012-05-29 12:57:52 +02001157 assert(PyUnicode_Check(to));
1158 assert(PyUnicode_IS_READY(to));
1159 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1160
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001161 if (how_many == 0)
1162 return 0;
1163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001164 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001165 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001167 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168
Victor Stinnerf1852262012-06-16 16:38:26 +02001169#ifdef Py_DEBUG
1170 if (!check_maxchar
1171 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1172 {
1173 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1174 Py_UCS4 ch;
1175 Py_ssize_t i;
1176 for (i=0; i < how_many; i++) {
1177 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1178 assert(ch <= to_maxchar);
1179 }
1180 }
1181#endif
1182
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001183 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001184 if (check_maxchar
1185 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1186 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001187 /* Writing Latin-1 characters into an ASCII string requires to
1188 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001189 Py_UCS4 max_char;
1190 max_char = ucs1lib_find_max_char(from_data,
1191 (Py_UCS1*)from_data + how_many);
1192 if (max_char >= 128)
1193 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001194 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001195 Py_MEMCPY((char*)to_data + to_kind * to_start,
1196 (char*)from_data + from_kind * from_start,
1197 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001198 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001199 else if (from_kind == PyUnicode_1BYTE_KIND
1200 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001201 {
1202 _PyUnicode_CONVERT_BYTES(
1203 Py_UCS1, Py_UCS2,
1204 PyUnicode_1BYTE_DATA(from) + from_start,
1205 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1206 PyUnicode_2BYTE_DATA(to) + to_start
1207 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001208 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001209 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001210 && to_kind == PyUnicode_4BYTE_KIND)
1211 {
1212 _PyUnicode_CONVERT_BYTES(
1213 Py_UCS1, Py_UCS4,
1214 PyUnicode_1BYTE_DATA(from) + from_start,
1215 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1216 PyUnicode_4BYTE_DATA(to) + to_start
1217 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001218 }
1219 else if (from_kind == PyUnicode_2BYTE_KIND
1220 && to_kind == PyUnicode_4BYTE_KIND)
1221 {
1222 _PyUnicode_CONVERT_BYTES(
1223 Py_UCS2, Py_UCS4,
1224 PyUnicode_2BYTE_DATA(from) + from_start,
1225 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1226 PyUnicode_4BYTE_DATA(to) + to_start
1227 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001228 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001229 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001230 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1231
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001232 if (!check_maxchar) {
1233 if (from_kind == PyUnicode_2BYTE_KIND
1234 && to_kind == PyUnicode_1BYTE_KIND)
1235 {
1236 _PyUnicode_CONVERT_BYTES(
1237 Py_UCS2, Py_UCS1,
1238 PyUnicode_2BYTE_DATA(from) + from_start,
1239 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1240 PyUnicode_1BYTE_DATA(to) + to_start
1241 );
1242 }
1243 else if (from_kind == PyUnicode_4BYTE_KIND
1244 && to_kind == PyUnicode_1BYTE_KIND)
1245 {
1246 _PyUnicode_CONVERT_BYTES(
1247 Py_UCS4, Py_UCS1,
1248 PyUnicode_4BYTE_DATA(from) + from_start,
1249 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1250 PyUnicode_1BYTE_DATA(to) + to_start
1251 );
1252 }
1253 else if (from_kind == PyUnicode_4BYTE_KIND
1254 && to_kind == PyUnicode_2BYTE_KIND)
1255 {
1256 _PyUnicode_CONVERT_BYTES(
1257 Py_UCS4, Py_UCS2,
1258 PyUnicode_4BYTE_DATA(from) + from_start,
1259 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1260 PyUnicode_2BYTE_DATA(to) + to_start
1261 );
1262 }
1263 else {
1264 assert(0);
1265 return -1;
1266 }
1267 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001268 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001269 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001270 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001271 Py_ssize_t i;
1272
Victor Stinnera0702ab2011-09-29 14:14:38 +02001273 for (i=0; i < how_many; i++) {
1274 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001275 if (ch > to_maxchar)
1276 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001277 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1278 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001279 }
1280 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001281 return 0;
1282}
1283
Victor Stinnerd3f08822012-05-29 12:57:52 +02001284void
1285_PyUnicode_FastCopyCharacters(
1286 PyObject *to, Py_ssize_t to_start,
1287 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001288{
1289 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1290}
1291
1292Py_ssize_t
1293PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1294 PyObject *from, Py_ssize_t from_start,
1295 Py_ssize_t how_many)
1296{
1297 int err;
1298
1299 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1300 PyErr_BadInternalCall();
1301 return -1;
1302 }
1303
Benjamin Petersonbac79492012-01-14 13:34:47 -05001304 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001305 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001306 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001307 return -1;
1308
Victor Stinnerd3f08822012-05-29 12:57:52 +02001309 if (from_start < 0) {
1310 PyErr_SetString(PyExc_IndexError, "string index out of range");
1311 return -1;
1312 }
1313 if (to_start < 0) {
1314 PyErr_SetString(PyExc_IndexError, "string index out of range");
1315 return -1;
1316 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001317 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1318 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1319 PyErr_Format(PyExc_SystemError,
1320 "Cannot write %zi characters at %zi "
1321 "in a string of %zi characters",
1322 how_many, to_start, PyUnicode_GET_LENGTH(to));
1323 return -1;
1324 }
1325
1326 if (how_many == 0)
1327 return 0;
1328
Victor Stinner488fa492011-12-12 00:01:39 +01001329 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001330 return -1;
1331
1332 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1333 if (err) {
1334 PyErr_Format(PyExc_SystemError,
1335 "Cannot copy %s characters "
1336 "into a string of %s characters",
1337 unicode_kind_name(from),
1338 unicode_kind_name(to));
1339 return -1;
1340 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001341 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342}
1343
Victor Stinner17222162011-09-28 22:15:37 +02001344/* Find the maximum code point and count the number of surrogate pairs so a
1345 correct string length can be computed before converting a string to UCS4.
1346 This function counts single surrogates as a character and not as a pair.
1347
1348 Return 0 on success, or -1 on error. */
1349static int
1350find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1351 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352{
1353 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001354 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355
Victor Stinnerc53be962011-10-02 21:33:54 +02001356 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 *num_surrogates = 0;
1358 *maxchar = 0;
1359
1360 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001362 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1363 && (iter+1) < end
1364 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001366 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 iter += 2;
1369 }
1370 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001372 {
1373 ch = *iter;
1374 iter++;
1375 }
1376 if (ch > *maxchar) {
1377 *maxchar = ch;
1378 if (*maxchar > MAX_UNICODE) {
1379 PyErr_Format(PyExc_ValueError,
1380 "character U+%x is not in range [U+0000; U+10ffff]",
1381 ch);
1382 return -1;
1383 }
1384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 }
1386 return 0;
1387}
1388
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001389int
1390_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391{
1392 wchar_t *end;
1393 Py_UCS4 maxchar = 0;
1394 Py_ssize_t num_surrogates;
1395#if SIZEOF_WCHAR_T == 2
1396 Py_ssize_t length_wo_surrogates;
1397#endif
1398
Georg Brandl7597add2011-10-05 16:36:47 +02001399 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001400 strings were created using _PyObject_New() and where no canonical
1401 representation (the str field) has been set yet aka strings
1402 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001403 assert(_PyUnicode_CHECK(unicode));
1404 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001406 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001407 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001408 /* Actually, it should neither be interned nor be anything else: */
1409 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001412 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001413 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415
1416 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001417 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1418 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 PyErr_NoMemory();
1420 return -1;
1421 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001422 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001423 _PyUnicode_WSTR(unicode), end,
1424 PyUnicode_1BYTE_DATA(unicode));
1425 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1426 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1427 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1428 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001429 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001430 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001431 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001432 }
1433 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001434 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001435 _PyUnicode_UTF8(unicode) = NULL;
1436 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 }
1438 PyObject_FREE(_PyUnicode_WSTR(unicode));
1439 _PyUnicode_WSTR(unicode) = NULL;
1440 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1441 }
1442 /* In this case we might have to convert down from 4-byte native
1443 wchar_t to 2-byte unicode. */
1444 else if (maxchar < 65536) {
1445 assert(num_surrogates == 0 &&
1446 "FindMaxCharAndNumSurrogatePairs() messed up");
1447
Victor Stinner506f5922011-09-28 22:34:18 +02001448#if SIZEOF_WCHAR_T == 2
1449 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001450 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001451 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1452 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1453 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001454 _PyUnicode_UTF8(unicode) = NULL;
1455 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001456#else
1457 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001458 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001459 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001460 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001461 PyErr_NoMemory();
1462 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 }
Victor Stinner506f5922011-09-28 22:34:18 +02001464 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1465 _PyUnicode_WSTR(unicode), end,
1466 PyUnicode_2BYTE_DATA(unicode));
1467 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1468 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1469 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001470 _PyUnicode_UTF8(unicode) = NULL;
1471 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001472 PyObject_FREE(_PyUnicode_WSTR(unicode));
1473 _PyUnicode_WSTR(unicode) = NULL;
1474 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1475#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 }
1477 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1478 else {
1479#if SIZEOF_WCHAR_T == 2
1480 /* in case the native representation is 2-bytes, we need to allocate a
1481 new normalized 4-byte version. */
1482 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1484 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 PyErr_NoMemory();
1486 return -1;
1487 }
1488 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1489 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001490 _PyUnicode_UTF8(unicode) = NULL;
1491 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001492 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1493 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001494 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 PyObject_FREE(_PyUnicode_WSTR(unicode));
1496 _PyUnicode_WSTR(unicode) = NULL;
1497 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1498#else
1499 assert(num_surrogates == 0);
1500
Victor Stinnerc3c74152011-10-02 20:39:55 +02001501 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001502 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001503 _PyUnicode_UTF8(unicode) = NULL;
1504 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1506#endif
1507 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1508 }
1509 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001510 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 return 0;
1512}
1513
Alexander Belopolsky40018472011-02-26 01:02:56 +00001514static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001515unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516{
Walter Dörwald16807132007-05-25 13:52:07 +00001517 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 case SSTATE_NOT_INTERNED:
1519 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001520
Benjamin Peterson29060642009-01-31 22:14:21 +00001521 case SSTATE_INTERNED_MORTAL:
1522 /* revive dead object temporarily for DelItem */
1523 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001524 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001525 Py_FatalError(
1526 "deletion of interned string failed");
1527 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001528
Benjamin Peterson29060642009-01-31 22:14:21 +00001529 case SSTATE_INTERNED_IMMORTAL:
1530 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001531
Benjamin Peterson29060642009-01-31 22:14:21 +00001532 default:
1533 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001534 }
1535
Victor Stinner03490912011-10-03 23:45:12 +02001536 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001538 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001539 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001540 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1541 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001543 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544}
1545
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or one of the
   cached single-character latin1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    const PyASCIIObject *header = (const PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical kind can be a cached
       latin1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return first < 256 && unicode_latin1[first] == unicode;
}
#endif
1562
Alexander Belopolsky40018472011-02-26 01:02:56 +00001563static int
Victor Stinner488fa492011-12-12 00:01:39 +01001564unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001565{
Victor Stinner488fa492011-12-12 00:01:39 +01001566 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001567 if (Py_REFCNT(unicode) != 1)
1568 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001569 if (_PyUnicode_HASH(unicode) != -1)
1570 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001571 if (PyUnicode_CHECK_INTERNED(unicode))
1572 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001573 if (!PyUnicode_CheckExact(unicode))
1574 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001575#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001576 /* singleton refcount is greater than 1 */
1577 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001578#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001579 return 1;
1580}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001581
Victor Stinnerfe226c02011-10-03 03:52:20 +02001582static int
1583unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1584{
1585 PyObject *unicode;
1586 Py_ssize_t old_length;
1587
1588 assert(p_unicode != NULL);
1589 unicode = *p_unicode;
1590
1591 assert(unicode != NULL);
1592 assert(PyUnicode_Check(unicode));
1593 assert(0 <= length);
1594
Victor Stinner910337b2011-10-03 03:20:16 +02001595 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001596 old_length = PyUnicode_WSTR_LENGTH(unicode);
1597 else
1598 old_length = PyUnicode_GET_LENGTH(unicode);
1599 if (old_length == length)
1600 return 0;
1601
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001602 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001603 _Py_INCREF_UNICODE_EMPTY();
1604 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001605 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001606 Py_DECREF(*p_unicode);
1607 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608 return 0;
1609 }
1610
Victor Stinner488fa492011-12-12 00:01:39 +01001611 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001612 PyObject *copy = resize_copy(unicode, length);
1613 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001614 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 Py_DECREF(*p_unicode);
1616 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618 }
1619
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001621 PyObject *new_unicode = resize_compact(unicode, length);
1622 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001624 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001625 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001626 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001627 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001628}
1629
Alexander Belopolsky40018472011-02-26 01:02:56 +00001630int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 PyObject *unicode;
1634 if (p_unicode == NULL) {
1635 PyErr_BadInternalCall();
1636 return -1;
1637 }
1638 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001639 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001640 {
1641 PyErr_BadInternalCall();
1642 return -1;
1643 }
1644 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001645}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001646
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001648unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1649 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001650{
1651 PyObject *result;
1652 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001653 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001654 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1655 return 0;
1656 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1657 maxchar);
1658 if (result == NULL)
1659 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001660 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 Py_DECREF(*p_unicode);
1662 *p_unicode = result;
1663 return 0;
1664}
1665
1666static int
1667unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1668 Py_UCS4 ch)
1669{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001670 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001671 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001672 return -1;
1673 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1674 PyUnicode_DATA(*p_unicode),
1675 (*pos)++, ch);
1676 return 0;
1677}
1678
Victor Stinnerc5166102012-02-22 13:55:02 +01001679/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001680
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001681 WARNING: The function doesn't copy the terminating null character and
1682 doesn't check the maximum character (may write a latin1 character in an
1683 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001684static void
1685unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1686 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001687{
1688 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1689 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001690 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001691
1692 switch (kind) {
1693 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001694 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001695 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001696 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001697 }
1698 case PyUnicode_2BYTE_KIND: {
1699 Py_UCS2 *start = (Py_UCS2 *)data + index;
1700 Py_UCS2 *ucs2 = start;
1701 assert(index <= PyUnicode_GET_LENGTH(unicode));
1702
Victor Stinner184252a2012-06-16 02:57:41 +02001703 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001704 *ucs2 = (Py_UCS2)*str;
1705
1706 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001707 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001708 }
1709 default: {
1710 Py_UCS4 *start = (Py_UCS4 *)data + index;
1711 Py_UCS4 *ucs4 = start;
1712 assert(kind == PyUnicode_4BYTE_KIND);
1713 assert(index <= PyUnicode_GET_LENGTH(unicode));
1714
Victor Stinner184252a2012-06-16 02:57:41 +02001715 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001716 *ucs4 = (Py_UCS4)*str;
1717
1718 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001719 }
1720 }
1721}
1722
1723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724static PyObject*
1725get_latin1_char(unsigned char ch)
1726{
Victor Stinnera464fc12011-10-02 20:39:30 +02001727 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001728 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001729 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730 if (!unicode)
1731 return NULL;
1732 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001733 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 unicode_latin1[ch] = unicode;
1735 }
1736 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001737 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738}
1739
Alexander Belopolsky40018472011-02-26 01:02:56 +00001740PyObject *
1741PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001743 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 Py_UCS4 maxchar = 0;
1745 Py_ssize_t num_surrogates;
1746
1747 if (u == NULL)
1748 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001750 /* If the Unicode data is known at construction time, we can apply
1751 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001754 if (size == 0)
1755 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 /* Single character Unicode objects in the Latin-1 range are
1758 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001759 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 return get_latin1_char((unsigned char)*u);
1761
1762 /* If not empty and not single character, copy the Unicode data
1763 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001764 if (find_maxchar_surrogates(u, u + size,
1765 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 return NULL;
1767
Victor Stinner8faf8212011-12-08 22:14:11 +01001768 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769 if (!unicode)
1770 return NULL;
1771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 switch (PyUnicode_KIND(unicode)) {
1773 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001774 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1776 break;
1777 case PyUnicode_2BYTE_KIND:
1778#if Py_UNICODE_SIZE == 2
1779 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1780#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001781 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1783#endif
1784 break;
1785 case PyUnicode_4BYTE_KIND:
1786#if SIZEOF_WCHAR_T == 2
1787 /* This is the only case which has to process surrogates, thus
1788 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001789 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790#else
1791 assert(num_surrogates == 0);
1792 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1793#endif
1794 break;
1795 default:
1796 assert(0 && "Impossible state");
1797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001799 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800}
1801
Alexander Belopolsky40018472011-02-26 01:02:56 +00001802PyObject *
1803PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001804{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001805 if (size < 0) {
1806 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001807 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001808 return NULL;
1809 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001810 if (u != NULL)
1811 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1812 else
1813 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001814}
1815
Alexander Belopolsky40018472011-02-26 01:02:56 +00001816PyObject *
1817PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001818{
1819 size_t size = strlen(u);
1820 if (size > PY_SSIZE_T_MAX) {
1821 PyErr_SetString(PyExc_OverflowError, "input too long");
1822 return NULL;
1823 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001824 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001825}
1826
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001827PyObject *
1828_PyUnicode_FromId(_Py_Identifier *id)
1829{
1830 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001831 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1832 strlen(id->string),
1833 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001834 if (!id->object)
1835 return NULL;
1836 PyUnicode_InternInPlace(&id->object);
1837 assert(!id->next);
1838 id->next = static_strings;
1839 static_strings = id;
1840 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001841 return id->object;
1842}
1843
1844void
1845_PyUnicode_ClearStaticStrings()
1846{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001847 _Py_Identifier *tmp, *s = static_strings;
1848 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001849 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001850 tmp = s->next;
1851 s->next = NULL;
1852 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001853 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001854 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001855}
1856
Benjamin Peterson0df54292012-03-26 14:50:32 -04001857/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001858
Victor Stinnerd3f08822012-05-29 12:57:52 +02001859PyObject*
1860_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001861{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001862 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001863 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001864 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001865#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001866 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001867#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001868 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001869 }
Victor Stinner785938e2011-12-11 20:09:03 +01001870 unicode = PyUnicode_New(size, 127);
1871 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001872 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001873 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1874 assert(_PyUnicode_CheckConsistency(unicode, 1));
1875 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001876}
1877
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001878static Py_UCS4
1879kind_maxchar_limit(unsigned int kind)
1880{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001881 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001882 case PyUnicode_1BYTE_KIND:
1883 return 0x80;
1884 case PyUnicode_2BYTE_KIND:
1885 return 0x100;
1886 case PyUnicode_4BYTE_KIND:
1887 return 0x10000;
1888 default:
1889 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001890 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001891 }
1892}
1893
Victor Stinnere6abb482012-05-02 01:15:40 +02001894Py_LOCAL_INLINE(Py_UCS4)
1895align_maxchar(Py_UCS4 maxchar)
1896{
1897 if (maxchar <= 127)
1898 return 127;
1899 else if (maxchar <= 255)
1900 return 255;
1901 else if (maxchar <= 65535)
1902 return 65535;
1903 else
1904 return MAX_UNICODE;
1905}
1906
Victor Stinner702c7342011-10-05 13:50:52 +02001907static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001908_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001909{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001910 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001911 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001912
Serhiy Storchaka678db842013-01-26 12:16:36 +02001913 if (size == 0)
1914 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001915 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001916 if (size == 1)
1917 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001918
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001919 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001920 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 if (!res)
1922 return NULL;
1923 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001924 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001925 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001926}
1927
Victor Stinnere57b1c02011-09-28 22:20:48 +02001928static PyObject*
1929_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930{
1931 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001932 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001933
Serhiy Storchaka678db842013-01-26 12:16:36 +02001934 if (size == 0)
1935 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001936 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001937 if (size == 1) {
1938 Py_UCS4 ch = u[0];
1939 if (ch < 256)
1940 return get_latin1_char((unsigned char)ch);
1941
1942 res = PyUnicode_New(1, ch);
1943 if (res == NULL)
1944 return NULL;
1945 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1946 assert(_PyUnicode_CheckConsistency(res, 1));
1947 return res;
1948 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001949
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001950 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001951 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 if (!res)
1953 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001954 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001956 else {
1957 _PyUnicode_CONVERT_BYTES(
1958 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1959 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001960 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 return res;
1962}
1963
Victor Stinnere57b1c02011-09-28 22:20:48 +02001964static PyObject*
1965_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966{
1967 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001968 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001969
Serhiy Storchaka678db842013-01-26 12:16:36 +02001970 if (size == 0)
1971 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001972 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001973 if (size == 1) {
1974 Py_UCS4 ch = u[0];
1975 if (ch < 256)
1976 return get_latin1_char((unsigned char)ch);
1977
1978 res = PyUnicode_New(1, ch);
1979 if (res == NULL)
1980 return NULL;
1981 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1982 assert(_PyUnicode_CheckConsistency(res, 1));
1983 return res;
1984 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001985
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001986 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001987 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 if (!res)
1989 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001990 if (max_char < 256)
1991 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1992 PyUnicode_1BYTE_DATA(res));
1993 else if (max_char < 0x10000)
1994 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1995 PyUnicode_2BYTE_DATA(res));
1996 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001998 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 return res;
2000}
2001
2002PyObject*
2003PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2004{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002005 if (size < 0) {
2006 PyErr_SetString(PyExc_ValueError, "size must be positive");
2007 return NULL;
2008 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002009 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002011 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002013 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002015 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002016 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002017 PyErr_SetString(PyExc_SystemError, "invalid kind");
2018 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020}
2021
Victor Stinnerece58de2012-04-23 23:36:38 +02002022Py_UCS4
2023_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2024{
2025 enum PyUnicode_Kind kind;
2026 void *startptr, *endptr;
2027
2028 assert(PyUnicode_IS_READY(unicode));
2029 assert(0 <= start);
2030 assert(end <= PyUnicode_GET_LENGTH(unicode));
2031 assert(start <= end);
2032
2033 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2034 return PyUnicode_MAX_CHAR_VALUE(unicode);
2035
2036 if (start == end)
2037 return 127;
2038
Victor Stinner94d558b2012-04-27 22:26:58 +02002039 if (PyUnicode_IS_ASCII(unicode))
2040 return 127;
2041
Victor Stinnerece58de2012-04-23 23:36:38 +02002042 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002043 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002044 endptr = (char *)startptr + end * kind;
2045 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002046 switch(kind) {
2047 case PyUnicode_1BYTE_KIND:
2048 return ucs1lib_find_max_char(startptr, endptr);
2049 case PyUnicode_2BYTE_KIND:
2050 return ucs2lib_find_max_char(startptr, endptr);
2051 case PyUnicode_4BYTE_KIND:
2052 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002053 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002054 assert(0);
2055 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002056 }
2057}
2058
Victor Stinner25a4b292011-10-06 12:31:55 +02002059/* Ensure that a string uses the most efficient storage, if it is not the
2060 case: create a new string with of the right kind. Write NULL into *p_unicode
2061 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002062static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002063unicode_adjust_maxchar(PyObject **p_unicode)
2064{
2065 PyObject *unicode, *copy;
2066 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002067 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002068 unsigned int kind;
2069
2070 assert(p_unicode != NULL);
2071 unicode = *p_unicode;
2072 assert(PyUnicode_IS_READY(unicode));
2073 if (PyUnicode_IS_ASCII(unicode))
2074 return;
2075
2076 len = PyUnicode_GET_LENGTH(unicode);
2077 kind = PyUnicode_KIND(unicode);
2078 if (kind == PyUnicode_1BYTE_KIND) {
2079 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002080 max_char = ucs1lib_find_max_char(u, u + len);
2081 if (max_char >= 128)
2082 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002083 }
2084 else if (kind == PyUnicode_2BYTE_KIND) {
2085 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002086 max_char = ucs2lib_find_max_char(u, u + len);
2087 if (max_char >= 256)
2088 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002089 }
2090 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002091 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002092 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002093 max_char = ucs4lib_find_max_char(u, u + len);
2094 if (max_char >= 0x10000)
2095 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002096 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002097 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002098 if (copy != NULL)
2099 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002100 Py_DECREF(unicode);
2101 *p_unicode = copy;
2102}
2103
Victor Stinner034f6cf2011-09-30 02:26:44 +02002104PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002105_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002106{
Victor Stinner87af4f22011-11-21 23:03:47 +01002107 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002108 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002109
Victor Stinner034f6cf2011-09-30 02:26:44 +02002110 if (!PyUnicode_Check(unicode)) {
2111 PyErr_BadInternalCall();
2112 return NULL;
2113 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002114 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002115 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002116
Victor Stinner87af4f22011-11-21 23:03:47 +01002117 length = PyUnicode_GET_LENGTH(unicode);
2118 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002119 if (!copy)
2120 return NULL;
2121 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2122
Victor Stinner87af4f22011-11-21 23:03:47 +01002123 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2124 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002125 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002126 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002127}
2128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129
Victor Stinnerbc603d12011-10-02 01:00:40 +02002130/* Widen Unicode objects to larger buffers. Don't write terminating null
2131 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002132
2133void*
2134_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2135{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002136 Py_ssize_t len;
2137 void *result;
2138 unsigned int skind;
2139
Benjamin Petersonbac79492012-01-14 13:34:47 -05002140 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002141 return NULL;
2142
2143 len = PyUnicode_GET_LENGTH(s);
2144 skind = PyUnicode_KIND(s);
2145 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002146 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002147 return NULL;
2148 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002149 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002150 case PyUnicode_2BYTE_KIND:
2151 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2152 if (!result)
2153 return PyErr_NoMemory();
2154 assert(skind == PyUnicode_1BYTE_KIND);
2155 _PyUnicode_CONVERT_BYTES(
2156 Py_UCS1, Py_UCS2,
2157 PyUnicode_1BYTE_DATA(s),
2158 PyUnicode_1BYTE_DATA(s) + len,
2159 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002160 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002161 case PyUnicode_4BYTE_KIND:
2162 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2163 if (!result)
2164 return PyErr_NoMemory();
2165 if (skind == PyUnicode_2BYTE_KIND) {
2166 _PyUnicode_CONVERT_BYTES(
2167 Py_UCS2, Py_UCS4,
2168 PyUnicode_2BYTE_DATA(s),
2169 PyUnicode_2BYTE_DATA(s) + len,
2170 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002172 else {
2173 assert(skind == PyUnicode_1BYTE_KIND);
2174 _PyUnicode_CONVERT_BYTES(
2175 Py_UCS1, Py_UCS4,
2176 PyUnicode_1BYTE_DATA(s),
2177 PyUnicode_1BYTE_DATA(s) + len,
2178 result);
2179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002181 default:
2182 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 }
Victor Stinner01698042011-10-04 00:04:26 +02002184 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 return NULL;
2186}
2187
2188static Py_UCS4*
2189as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2190 int copy_null)
2191{
2192 int kind;
2193 void *data;
2194 Py_ssize_t len, targetlen;
2195 if (PyUnicode_READY(string) == -1)
2196 return NULL;
2197 kind = PyUnicode_KIND(string);
2198 data = PyUnicode_DATA(string);
2199 len = PyUnicode_GET_LENGTH(string);
2200 targetlen = len;
2201 if (copy_null)
2202 targetlen++;
2203 if (!target) {
2204 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2205 PyErr_NoMemory();
2206 return NULL;
2207 }
2208 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2209 if (!target) {
2210 PyErr_NoMemory();
2211 return NULL;
2212 }
2213 }
2214 else {
2215 if (targetsize < targetlen) {
2216 PyErr_Format(PyExc_SystemError,
2217 "string is longer than the buffer");
2218 if (copy_null && 0 < targetsize)
2219 target[0] = 0;
2220 return NULL;
2221 }
2222 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002223 if (kind == PyUnicode_1BYTE_KIND) {
2224 Py_UCS1 *start = (Py_UCS1 *) data;
2225 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002227 else if (kind == PyUnicode_2BYTE_KIND) {
2228 Py_UCS2 *start = (Py_UCS2 *) data;
2229 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2230 }
2231 else {
2232 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 if (copy_null)
2236 target[len] = 0;
2237 return target;
2238}
2239
2240Py_UCS4*
2241PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2242 int copy_null)
2243{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002244 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 PyErr_BadInternalCall();
2246 return NULL;
2247 }
2248 return as_ucs4(string, target, targetsize, copy_null);
2249}
2250
2251Py_UCS4*
2252PyUnicode_AsUCS4Copy(PyObject *string)
2253{
2254 return as_ucs4(string, NULL, 0, 1);
2255}
2256
#ifdef HAVE_WCHAR_H

/* Create a string object from the wchar_t buffer `w` of `size` characters.

   A size of -1 means that the length is measured with wcslen().  A NULL
   buffer is only accepted together with size 0, in which case the empty
   string is returned; any other NULL call is rejected with
   PyErr_BadInternalCall() and NULL is returned.  The conversion itself is
   delegated to PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    /* No buffer: only the empty string can be produced. */
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();
    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002277
Walter Dörwald346737f2007-05-31 10:44:43 +00002278static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002279makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2280 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002281{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 *fmt++ = '%';
2283 if (width) {
2284 if (zeropad)
2285 *fmt++ = '0';
2286 fmt += sprintf(fmt, "%d", width);
2287 }
2288 if (precision)
2289 fmt += sprintf(fmt, ".%d", precision);
2290 if (longflag)
2291 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002292 else if (longlongflag) {
2293 /* longlongflag should only ever be nonzero on machines with
2294 HAVE_LONG_LONG defined */
2295#ifdef HAVE_LONG_LONG
2296 char *f = PY_FORMAT_LONG_LONG;
2297 while (*f)
2298 *fmt++ = *f++;
2299#else
2300 /* we shouldn't ever get here */
2301 assert(0);
2302 *fmt++ = 'l';
2303#endif
2304 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002305 else if (size_tflag) {
2306 char *f = PY_FORMAT_SIZE_T;
2307 while (*f)
2308 *fmt++ = *f++;
2309 }
2310 *fmt++ = c;
2311 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002312}
2313
/* Helper for PyUnicode_FromFormatV(): parse the optional width, precision
   and length modifier of one format specification.

   f must point at the '%' that starts the specification.  Each of the
   output pointers may be NULL; the non-NULL ones receive:
     *p_width, *p_precision   the parsed numbers (0 when absent),
     *p_longflag              1 for "l" followed by d, u or i,
     *p_longlongflag          1 for "ll" followed by d, u or i (only when
                              HAVE_LONG_LONG is defined),
     *p_size_tflag            1 for "z" followed by d, u or i.

   Returns a pointer to the character at which parsing stopped, normally
   the conversion character.  Width and precision are accumulated without
   any overflow check. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0, precision = 0;
    int longflag = 0, longlongflag = 0, size_tflag = 0;

    /* Step over the '%', then read the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        for (f++; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi.
       A modifier is only consumed when a matching conversion follows. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* Hand the results back through whichever pointers were supplied. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2378
/* Upper bound on the number of characters produced when formatting with
   %ld: 21 characters are enough for a 64-bit integer written in decimal
   together with an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced when formatting with
   %lld.  At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus
   1 for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2386
Walter Dörwaldd2034312007-05-18 16:29:38 +00002387PyObject *
2388PyUnicode_FromFormatV(const char *format, va_list vargs)
2389{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002390 va_list count;
2391 Py_ssize_t callcount = 0;
2392 PyObject **callresults = NULL;
2393 PyObject **callresult = NULL;
2394 Py_ssize_t n = 0;
2395 int width = 0;
2396 int precision = 0;
2397 int zeropad;
2398 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002399 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002400 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002401 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2403 Py_UCS4 argmaxchar;
2404 Py_ssize_t numbersize = 0;
2405 char *numberresults = NULL;
2406 char *numberresult = NULL;
2407 Py_ssize_t i;
2408 int kind;
2409 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002410
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002411 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002412 /* step 1: count the number of %S/%R/%A/%s format specifications
2413 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2414 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002416 * also estimate a upper bound for all the number formats in the string,
2417 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002419 for (f = format; *f; f++) {
2420 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002421 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2423 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2424 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2425 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002427 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002428#ifdef HAVE_LONG_LONG
2429 if (longlongflag) {
2430 if (width < MAX_LONG_LONG_CHARS)
2431 width = MAX_LONG_LONG_CHARS;
2432 }
2433 else
2434#endif
2435 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2436 including sign. Decimal takes the most space. This
2437 isn't enough for octal. If a width is specified we
2438 need more (which we allocate later). */
2439 if (width < MAX_LONG_CHARS)
2440 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441
2442 /* account for the size + '\0' to separate numbers
2443 inside of the numberresults buffer */
2444 numbersize += (width + 1);
2445 }
2446 }
2447 else if ((unsigned char)*f > 127) {
2448 PyErr_Format(PyExc_ValueError,
2449 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2450 "string, got a non-ASCII byte: 0x%02x",
2451 (unsigned char)*f);
2452 return NULL;
2453 }
2454 }
2455 /* step 2: allocate memory for the results of
2456 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2457 if (callcount) {
2458 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2459 if (!callresults) {
2460 PyErr_NoMemory();
2461 return NULL;
2462 }
2463 callresult = callresults;
2464 }
2465 /* step 2.5: allocate memory for the results of formating numbers */
2466 if (numbersize) {
2467 numberresults = PyObject_Malloc(numbersize);
2468 if (!numberresults) {
2469 PyErr_NoMemory();
2470 goto fail;
2471 }
2472 numberresult = numberresults;
2473 }
2474
2475 /* step 3: format numbers and figure out how large a buffer we need */
2476 for (f = format; *f; f++) {
2477 if (*f == '%') {
2478 const char* p;
2479 int longflag;
2480 int longlongflag;
2481 int size_tflag;
2482 int numprinted;
2483
2484 p = f;
2485 zeropad = (f[1] == '0');
2486 f = parse_format_flags(f, &width, &precision,
2487 &longflag, &longlongflag, &size_tflag);
2488 switch (*f) {
2489 case 'c':
2490 {
Serhiy Storchaka8eeae212013-06-23 20:12:14 +03002491 int ordinal = va_arg(count, int);
2492 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2493 PyErr_SetString(PyExc_OverflowError,
2494 "%c arg not in range(0x110000)");
2495 goto fail;
2496 }
2497 maxchar = Py_MAX(maxchar, (Py_UCS4)ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002498 n++;
2499 break;
2500 }
2501 case '%':
2502 n++;
2503 break;
2504 case 'i':
2505 case 'd':
2506 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2507 width, precision, *f);
2508 if (longflag)
2509 numprinted = sprintf(numberresult, fmt,
2510 va_arg(count, long));
2511#ifdef HAVE_LONG_LONG
2512 else if (longlongflag)
2513 numprinted = sprintf(numberresult, fmt,
2514 va_arg(count, PY_LONG_LONG));
2515#endif
2516 else if (size_tflag)
2517 numprinted = sprintf(numberresult, fmt,
2518 va_arg(count, Py_ssize_t));
2519 else
2520 numprinted = sprintf(numberresult, fmt,
2521 va_arg(count, int));
2522 n += numprinted;
2523 /* advance by +1 to skip over the '\0' */
2524 numberresult += (numprinted + 1);
2525 assert(*(numberresult - 1) == '\0');
2526 assert(*(numberresult - 2) != '\0');
2527 assert(numprinted >= 0);
2528 assert(numberresult <= numberresults + numbersize);
2529 break;
2530 case 'u':
2531 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2532 width, precision, 'u');
2533 if (longflag)
2534 numprinted = sprintf(numberresult, fmt,
2535 va_arg(count, unsigned long));
2536#ifdef HAVE_LONG_LONG
2537 else if (longlongflag)
2538 numprinted = sprintf(numberresult, fmt,
2539 va_arg(count, unsigned PY_LONG_LONG));
2540#endif
2541 else if (size_tflag)
2542 numprinted = sprintf(numberresult, fmt,
2543 va_arg(count, size_t));
2544 else
2545 numprinted = sprintf(numberresult, fmt,
2546 va_arg(count, unsigned int));
2547 n += numprinted;
2548 numberresult += (numprinted + 1);
2549 assert(*(numberresult - 1) == '\0');
2550 assert(*(numberresult - 2) != '\0');
2551 assert(numprinted >= 0);
2552 assert(numberresult <= numberresults + numbersize);
2553 break;
2554 case 'x':
2555 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2556 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2557 n += numprinted;
2558 numberresult += (numprinted + 1);
2559 assert(*(numberresult - 1) == '\0');
2560 assert(*(numberresult - 2) != '\0');
2561 assert(numprinted >= 0);
2562 assert(numberresult <= numberresults + numbersize);
2563 break;
2564 case 'p':
2565 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2566 /* %p is ill-defined: ensure leading 0x. */
2567 if (numberresult[1] == 'X')
2568 numberresult[1] = 'x';
2569 else if (numberresult[1] != 'x') {
2570 memmove(numberresult + 2, numberresult,
2571 strlen(numberresult) + 1);
2572 numberresult[0] = '0';
2573 numberresult[1] = 'x';
2574 numprinted += 2;
2575 }
2576 n += numprinted;
2577 numberresult += (numprinted + 1);
2578 assert(*(numberresult - 1) == '\0');
2579 assert(*(numberresult - 2) != '\0');
2580 assert(numprinted >= 0);
2581 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 break;
2583 case 's':
2584 {
2585 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002586 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002587 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002588 if (!str)
2589 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 /* since PyUnicode_DecodeUTF8 returns already flexible
2591 unicode objects, there is no need to call ready on them */
2592 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002593 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002595 /* Remember the str and switch to the next slot */
2596 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 break;
2598 }
2599 case 'U':
2600 {
2601 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002602 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 if (PyUnicode_READY(obj) == -1)
2604 goto fail;
2605 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002606 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002608 break;
2609 }
2610 case 'V':
2611 {
2612 PyObject *obj = va_arg(count, PyObject *);
2613 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002614 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002616 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002617 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618 if (PyUnicode_READY(obj) == -1)
2619 goto fail;
2620 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002621 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002623 *callresult++ = NULL;
2624 }
2625 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002626 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002627 if (!str_obj)
2628 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002629 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002630 Py_DECREF(str_obj);
2631 goto fail;
2632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002634 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002636 *callresult++ = str_obj;
2637 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 break;
2639 }
2640 case 'S':
2641 {
2642 PyObject *obj = va_arg(count, PyObject *);
2643 PyObject *str;
2644 assert(obj);
2645 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002646 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002647 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002648 if (PyUnicode_READY(str) == -1) {
2649 Py_DECREF(str);
2650 goto fail;
2651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002653 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 /* Remember the str and switch to the next slot */
2656 *callresult++ = str;
2657 break;
2658 }
2659 case 'R':
2660 {
2661 PyObject *obj = va_arg(count, PyObject *);
2662 PyObject *repr;
2663 assert(obj);
2664 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002665 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002667 if (PyUnicode_READY(repr) == -1) {
2668 Py_DECREF(repr);
2669 goto fail;
2670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002671 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002672 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 /* Remember the repr and switch to the next slot */
2675 *callresult++ = repr;
2676 break;
2677 }
2678 case 'A':
2679 {
2680 PyObject *obj = va_arg(count, PyObject *);
2681 PyObject *ascii;
2682 assert(obj);
2683 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002684 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002685 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002686 if (PyUnicode_READY(ascii) == -1) {
2687 Py_DECREF(ascii);
2688 goto fail;
2689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002691 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002692 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002693 /* Remember the repr and switch to the next slot */
2694 *callresult++ = ascii;
2695 break;
2696 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002697 default:
2698 /* if we stumble upon an unknown
2699 formatting code, copy the rest of
2700 the format string to the output
2701 string. (we cannot just skip the
2702 code, since there's no way to know
2703 what's in the argument list) */
2704 n += strlen(p);
2705 goto expand;
2706 }
2707 } else
2708 n++;
2709 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002710 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002711 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002712 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002713 we don't have to resize the string.
2714 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002715 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 if (!string)
2717 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 kind = PyUnicode_KIND(string);
2719 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002720 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002723 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002724 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002725 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002726
2727 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002728 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2729 /* checking for == because the last argument could be a empty
2730 string, which causes i to point to end, the assert at the end of
2731 the loop */
2732 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002733
Benjamin Peterson14339b62009-01-31 16:36:08 +00002734 switch (*f) {
2735 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002736 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002737 const int ordinal = va_arg(vargs, int);
2738 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002739 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002740 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002741 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002742 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002743 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002744 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002745 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002746 {
Victor Stinner184252a2012-06-16 02:57:41 +02002747 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 /* unused, since we already have the result */
2749 if (*f == 'p')
2750 (void) va_arg(vargs, void *);
2751 else
2752 (void) va_arg(vargs, int);
2753 /* extract the result from numberresults and append. */
Victor Stinner184252a2012-06-16 02:57:41 +02002754 len = strlen(numberresult);
2755 unicode_write_cstr(string, i, numberresult, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756 /* skip over the separating '\0' */
Victor Stinner184252a2012-06-16 02:57:41 +02002757 i += len;
2758 numberresult += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002759 assert(*numberresult == '\0');
2760 numberresult++;
2761 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002762 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002763 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002764 case 's':
2765 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002766 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002768 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002769 size = PyUnicode_GET_LENGTH(*callresult);
2770 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002771 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002772 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002773 /* We're done with the unicode()/repr() => forget it */
2774 Py_DECREF(*callresult);
2775 /* switch to next unicode()/repr() result */
2776 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002777 break;
2778 }
2779 case 'U':
2780 {
2781 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002782 Py_ssize_t size;
2783 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2784 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002785 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002786 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002787 break;
2788 }
2789 case 'V':
2790 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002792 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002793 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002794 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002795 size = PyUnicode_GET_LENGTH(obj);
2796 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002797 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002798 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002799 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002800 size = PyUnicode_GET_LENGTH(*callresult);
2801 assert(PyUnicode_KIND(*callresult) <=
2802 PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002803 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002804 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002805 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002806 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002807 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002808 break;
2809 }
2810 case 'S':
2811 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002812 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002813 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002814 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002815 /* unused, since we already have the result */
2816 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002817 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002818 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002819 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002820 /* We're done with the unicode()/repr() => forget it */
2821 Py_DECREF(*callresult);
2822 /* switch to next unicode()/repr() result */
2823 ++callresult;
2824 break;
2825 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002826 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002827 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002828 break;
2829 default:
Victor Stinner184252a2012-06-16 02:57:41 +02002830 {
2831 Py_ssize_t len = strlen(p);
2832 unicode_write_cstr(string, i, p, len);
2833 i += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002834 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002835 goto end;
2836 }
Victor Stinner184252a2012-06-16 02:57:41 +02002837 }
Victor Stinner1205f272010-09-11 00:54:47 +00002838 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002839 else {
2840 assert(i < PyUnicode_GET_LENGTH(string));
2841 PyUnicode_WRITE(kind, data, i++, *f);
2842 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002844 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002845
Benjamin Peterson29060642009-01-31 22:14:21 +00002846 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002847 if (callresults)
2848 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002849 if (numberresults)
2850 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002851 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002852 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002853 if (callresults) {
2854 PyObject **callresult2 = callresults;
2855 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002856 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002857 ++callresult2;
2858 }
2859 PyObject_Free(callresults);
2860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002861 if (numberresults)
2862 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002863 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002864}
2865
Walter Dörwaldd2034312007-05-18 16:29:38 +00002866PyObject *
2867PyUnicode_FromFormat(const char *format, ...)
2868{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002869 PyObject* ret;
2870 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002871
2872#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002873 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002874#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002875 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002876#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002877 ret = PyUnicode_FromFormatV(format, vargs);
2878 va_end(vargs);
2879 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002880}
2881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002882#ifdef HAVE_WCHAR_H
2883
Victor Stinner5593d8a2010-10-02 11:11:27 +00002884/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2885 convert a Unicode object to a wide character string.
2886
Victor Stinnerd88d9832011-09-06 02:00:05 +02002887 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002888 character) required to convert the unicode object. Ignore size argument.
2889
Victor Stinnerd88d9832011-09-06 02:00:05 +02002890 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002891 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002892 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002893static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002894unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002895 wchar_t *w,
2896 Py_ssize_t size)
2897{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002898 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002899 const wchar_t *wstr;
2900
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002901 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002902 if (wstr == NULL)
2903 return -1;
2904
Victor Stinner5593d8a2010-10-02 11:11:27 +00002905 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002906 if (size > res)
2907 size = res + 1;
2908 else
2909 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002910 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002911 return res;
2912 }
2913 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002914 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002915}
2916
2917Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002918PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002919 wchar_t *w,
2920 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921{
2922 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 PyErr_BadInternalCall();
2924 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002926 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002927}
2928
Victor Stinner137c34c2010-09-29 10:25:54 +00002929wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002930PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002931 Py_ssize_t *size)
2932{
2933 wchar_t* buffer;
2934 Py_ssize_t buflen;
2935
2936 if (unicode == NULL) {
2937 PyErr_BadInternalCall();
2938 return NULL;
2939 }
2940
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002941 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002942 if (buflen == -1)
2943 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002944 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002945 PyErr_NoMemory();
2946 return NULL;
2947 }
2948
Victor Stinner137c34c2010-09-29 10:25:54 +00002949 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2950 if (buffer == NULL) {
2951 PyErr_NoMemory();
2952 return NULL;
2953 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002954 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002955 if (buflen == -1) {
2956 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002958 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 if (size != NULL)
2960 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002961 return buffer;
2962}
2963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002964#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965
Alexander Belopolsky40018472011-02-26 01:02:56 +00002966PyObject *
2967PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002968{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002969 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002970 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002971 PyErr_SetString(PyExc_ValueError,
2972 "chr() arg not in range(0x110000)");
2973 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002974 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002975
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002976 if ((Py_UCS4)ordinal < 256)
2977 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002979 v = PyUnicode_New(1, ordinal);
2980 if (v == NULL)
2981 return NULL;
2982 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002983 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002984 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002985}
2986
Alexander Belopolsky40018472011-02-26 01:02:56 +00002987PyObject *
2988PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002990 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002991 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002992 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002993 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002994 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002995 Py_INCREF(obj);
2996 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002997 }
2998 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002999 /* For a Unicode subtype that's not a Unicode object,
3000 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003001 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003002 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003003 PyErr_Format(PyExc_TypeError,
3004 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003005 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003006 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003007}
3008
Alexander Belopolsky40018472011-02-26 01:02:56 +00003009PyObject *
3010PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003011 const char *encoding,
3012 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003013{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003014 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003015 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003016
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003018 PyErr_BadInternalCall();
3019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003021
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003022 /* Decoding bytes objects is the most common case and should be fast */
3023 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003024 if (PyBytes_GET_SIZE(obj) == 0)
3025 _Py_RETURN_UNICODE_EMPTY();
3026 v = PyUnicode_Decode(
3027 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3028 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003029 return v;
3030 }
3031
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003032 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 PyErr_SetString(PyExc_TypeError,
3034 "decoding str is not supported");
3035 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003036 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003037
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003038 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3039 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3040 PyErr_Format(PyExc_TypeError,
3041 "coercing to str: need bytes, bytearray "
3042 "or buffer-like object, %.80s found",
3043 Py_TYPE(obj)->tp_name);
3044 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003045 }
Tim Petersced69f82003-09-16 20:30:58 +00003046
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003047 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003048 PyBuffer_Release(&buffer);
3049 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003051
Serhiy Storchaka05997252013-01-26 12:14:02 +02003052 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003053 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003054 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055}
3056
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   encoding:  NUL-terminated encoding name, or NULL meaning "utf-8".
   lower:     output buffer receiving the normalized, NUL-terminated name.
   lower_len: size of the output buffer in bytes, terminator included.

   Return 1 on success. Return 0 on error, i.e. when the normalized name
   (including its terminating NUL) does not fit in lower_len bytes; the
   contents of lower are then unspecified. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without this check, lower_len - 1 below would wrap around and l_end
       would point outside the buffer. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1: never copy the default name into a
           buffer that is too small to hold it. */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last writable slot; reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3093
Alexander Belopolsky40018472011-02-26 01:02:56 +00003094PyObject *
3095PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003096 Py_ssize_t size,
3097 const char *encoding,
3098 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003099{
3100 PyObject *buffer = NULL, *unicode;
3101 Py_buffer info;
3102 char lower[11]; /* Enough for any encoding shortcut */
3103
Fred Drakee4315f52000-05-09 19:53:39 +00003104 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003105 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003106 if ((strcmp(lower, "utf-8") == 0) ||
3107 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003108 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003109 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003110 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003111 (strcmp(lower, "iso-8859-1") == 0))
3112 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003113#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003114 else if (strcmp(lower, "mbcs") == 0)
3115 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003116#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003117 else if (strcmp(lower, "ascii") == 0)
3118 return PyUnicode_DecodeASCII(s, size, errors);
3119 else if (strcmp(lower, "utf-16") == 0)
3120 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3121 else if (strcmp(lower, "utf-32") == 0)
3122 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3123 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124
3125 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003126 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003127 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003128 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003129 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 if (buffer == NULL)
3131 goto onError;
3132 unicode = PyCodec_Decode(buffer, encoding, errors);
3133 if (unicode == NULL)
3134 goto onError;
3135 if (!PyUnicode_Check(unicode)) {
3136 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003137 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003138 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139 Py_DECREF(unicode);
3140 goto onError;
3141 }
3142 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003143 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003144
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146 Py_XDECREF(buffer);
3147 return NULL;
3148}
3149
Alexander Belopolsky40018472011-02-26 01:02:56 +00003150PyObject *
3151PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003152 const char *encoding,
3153 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003154{
3155 PyObject *v;
3156
3157 if (!PyUnicode_Check(unicode)) {
3158 PyErr_BadArgument();
3159 goto onError;
3160 }
3161
3162 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003163 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003164
3165 /* Decode via the codec registry */
3166 v = PyCodec_Decode(unicode, encoding, errors);
3167 if (v == NULL)
3168 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003169 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003170
Benjamin Peterson29060642009-01-31 22:14:21 +00003171 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003172 return NULL;
3173}
3174
Alexander Belopolsky40018472011-02-26 01:02:56 +00003175PyObject *
3176PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003177 const char *encoding,
3178 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003179{
3180 PyObject *v;
3181
3182 if (!PyUnicode_Check(unicode)) {
3183 PyErr_BadArgument();
3184 goto onError;
3185 }
3186
3187 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003188 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003189
3190 /* Decode via the codec registry */
3191 v = PyCodec_Decode(unicode, encoding, errors);
3192 if (v == NULL)
3193 goto onError;
3194 if (!PyUnicode_Check(v)) {
3195 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003196 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003197 Py_TYPE(v)->tp_name);
3198 Py_DECREF(v);
3199 goto onError;
3200 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003201 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003202
Benjamin Peterson29060642009-01-31 22:14:21 +00003203 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003204 return NULL;
3205}
3206
Alexander Belopolsky40018472011-02-26 01:02:56 +00003207PyObject *
3208PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003209 Py_ssize_t size,
3210 const char *encoding,
3211 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212{
3213 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003214
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 unicode = PyUnicode_FromUnicode(s, size);
3216 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3219 Py_DECREF(unicode);
3220 return v;
3221}
3222
Alexander Belopolsky40018472011-02-26 01:02:56 +00003223PyObject *
3224PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003225 const char *encoding,
3226 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003227{
3228 PyObject *v;
3229
3230 if (!PyUnicode_Check(unicode)) {
3231 PyErr_BadArgument();
3232 goto onError;
3233 }
3234
3235 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003237
3238 /* Encode via the codec registry */
3239 v = PyCodec_Encode(unicode, encoding, errors);
3240 if (v == NULL)
3241 goto onError;
3242 return v;
3243
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003245 return NULL;
3246}
3247
/* Locate the first character of `wstr` that wcstombs() cannot encode.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide). The return value is the
   index, in wchar_t units, of the first character for which wcstombs()
   reports an error, or 0 if no such character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminator */
#else
    wchar_t unit[2];            /* single character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    for (cursor = wstr; *cursor != L'\0'; ) {
        /* Remember where the character under test begins. */
        const wchar_t *current = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return current - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3294
Victor Stinner1b579672011-12-17 05:47:23 +01003295static int
3296locale_error_handler(const char *errors, int *surrogateescape)
3297{
3298 if (errors == NULL) {
3299 *surrogateescape = 0;
3300 return 0;
3301 }
3302
3303 if (strcmp(errors, "strict") == 0) {
3304 *surrogateescape = 0;
3305 return 0;
3306 }
3307 if (strcmp(errors, "surrogateescape") == 0) {
3308 *surrogateescape = 1;
3309 return 0;
3310 }
3311 PyErr_Format(PyExc_ValueError,
3312 "only 'strict' and 'surrogateescape' error handlers "
3313 "are supported, not '%s'",
3314 errors);
3315 return -1;
3316}
3317
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003318PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003319PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003320{
3321 Py_ssize_t wlen, wlen2;
3322 wchar_t *wstr;
3323 PyObject *bytes = NULL;
3324 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003325 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003326 PyObject *exc;
3327 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003328 int surrogateescape;
3329
3330 if (locale_error_handler(errors, &surrogateescape) < 0)
3331 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003332
3333 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3334 if (wstr == NULL)
3335 return NULL;
3336
3337 wlen2 = wcslen(wstr);
3338 if (wlen2 != wlen) {
3339 PyMem_Free(wstr);
3340 PyErr_SetString(PyExc_TypeError, "embedded null character");
3341 return NULL;
3342 }
3343
3344 if (surrogateescape) {
3345 /* locale encoding with surrogateescape */
3346 char *str;
3347
3348 str = _Py_wchar2char(wstr, &error_pos);
3349 if (str == NULL) {
3350 if (error_pos == (size_t)-1) {
3351 PyErr_NoMemory();
3352 PyMem_Free(wstr);
3353 return NULL;
3354 }
3355 else {
3356 goto encode_error;
3357 }
3358 }
3359 PyMem_Free(wstr);
3360
3361 bytes = PyBytes_FromString(str);
3362 PyMem_Free(str);
3363 }
3364 else {
3365 size_t len, len2;
3366
3367 len = wcstombs(NULL, wstr, 0);
3368 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003369 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003370 goto encode_error;
3371 }
3372
3373 bytes = PyBytes_FromStringAndSize(NULL, len);
3374 if (bytes == NULL) {
3375 PyMem_Free(wstr);
3376 return NULL;
3377 }
3378
3379 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3380 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003381 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003382 goto encode_error;
3383 }
3384 PyMem_Free(wstr);
3385 }
3386 return bytes;
3387
3388encode_error:
3389 errmsg = strerror(errno);
3390 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003391
3392 if (error_pos == (size_t)-1)
3393 error_pos = wcstombs_errorpos(wstr);
3394
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003395 PyMem_Free(wstr);
3396 Py_XDECREF(bytes);
3397
Victor Stinner2f197072011-12-17 07:08:30 +01003398 if (errmsg != NULL) {
3399 size_t errlen;
3400 wstr = _Py_char2wchar(errmsg, &errlen);
3401 if (wstr != NULL) {
3402 reason = PyUnicode_FromWideChar(wstr, errlen);
3403 PyMem_Free(wstr);
3404 } else
3405 errmsg = NULL;
3406 }
3407 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003408 reason = PyUnicode_FromString(
3409 "wcstombs() encountered an unencodable "
3410 "wide character");
3411 if (reason == NULL)
3412 return NULL;
3413
3414 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3415 "locale", unicode,
3416 (Py_ssize_t)error_pos,
3417 (Py_ssize_t)(error_pos+1),
3418 reason);
3419 Py_DECREF(reason);
3420 if (exc != NULL) {
3421 PyCodec_StrictErrors(exc);
3422 Py_XDECREF(exc);
3423 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003424 return NULL;
3425}
3426
Victor Stinnerad158722010-10-27 00:25:46 +00003427PyObject *
3428PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003429{
Victor Stinner99b95382011-07-04 14:23:54 +02003430#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003431 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003432#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003433 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003434#else
Victor Stinner793b5312011-04-27 00:24:21 +02003435 PyInterpreterState *interp = PyThreadState_GET()->interp;
3436 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3437 cannot use it to encode and decode filenames before it is loaded. Load
3438 the Python codec requires to encode at least its own filename. Use the C
3439 version of the locale codec until the codec registry is initialized and
3440 the Python codec is loaded.
3441
3442 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3443 cannot only rely on it: check also interp->fscodec_initialized for
3444 subinterpreters. */
3445 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003446 return PyUnicode_AsEncodedString(unicode,
3447 Py_FileSystemDefaultEncoding,
3448 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003449 }
3450 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003451 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003452 }
Victor Stinnerad158722010-10-27 00:25:46 +00003453#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003454}
3455
Alexander Belopolsky40018472011-02-26 01:02:56 +00003456PyObject *
3457PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003458 const char *encoding,
3459 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460{
3461 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003462 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003463
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 if (!PyUnicode_Check(unicode)) {
3465 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003466 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 }
Fred Drakee4315f52000-05-09 19:53:39 +00003468
Fred Drakee4315f52000-05-09 19:53:39 +00003469 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003470 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003471 if ((strcmp(lower, "utf-8") == 0) ||
3472 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003473 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003474 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003475 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003476 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003477 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003478 }
Victor Stinner37296e82010-06-10 13:36:23 +00003479 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003480 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003481 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003482 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003483#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003484 else if (strcmp(lower, "mbcs") == 0)
3485 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003486#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003487 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003488 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490
3491 /* Encode via the codec registry */
3492 v = PyCodec_Encode(unicode, encoding, errors);
3493 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003494 return NULL;
3495
3496 /* The normal path */
3497 if (PyBytes_Check(v))
3498 return v;
3499
3500 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003501 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003502 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003503 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003504
3505 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3506 "encoder %s returned bytearray instead of bytes",
3507 encoding);
3508 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003509 Py_DECREF(v);
3510 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003511 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003512
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003513 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3514 Py_DECREF(v);
3515 return b;
3516 }
3517
3518 PyErr_Format(PyExc_TypeError,
3519 "encoder did not return a bytes object (type=%.400s)",
3520 Py_TYPE(v)->tp_name);
3521 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003522 return NULL;
3523}
3524
Alexander Belopolsky40018472011-02-26 01:02:56 +00003525PyObject *
3526PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003527 const char *encoding,
3528 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003529{
3530 PyObject *v;
3531
3532 if (!PyUnicode_Check(unicode)) {
3533 PyErr_BadArgument();
3534 goto onError;
3535 }
3536
3537 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003538 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003539
3540 /* Encode via the codec registry */
3541 v = PyCodec_Encode(unicode, encoding, errors);
3542 if (v == NULL)
3543 goto onError;
3544 if (!PyUnicode_Check(v)) {
3545 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003546 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003547 Py_TYPE(v)->tp_name);
3548 Py_DECREF(v);
3549 goto onError;
3550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003552
Benjamin Peterson29060642009-01-31 22:14:21 +00003553 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 return NULL;
3555}
3556
/* Find the offset of the first byte sequence in str[0..len) that mbrtowc()
   rejects (conversion error or incomplete character).

   Returns that byte offset. Returns 0 when the whole input converts, when
   a null wide character is reached first, or when mbrtowc() is not
   available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(cursor - str);
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3587
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003588PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003589PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003590 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003591{
3592 wchar_t smallbuf[256];
3593 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3594 wchar_t *wstr;
3595 size_t wlen, wlen2;
3596 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003597 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003598 size_t error_pos;
3599 char *errmsg;
3600 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003601
3602 if (locale_error_handler(errors, &surrogateescape) < 0)
3603 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003604
3605 if (str[len] != '\0' || len != strlen(str)) {
3606 PyErr_SetString(PyExc_TypeError, "embedded null character");
3607 return NULL;
3608 }
3609
3610 if (surrogateescape)
3611 {
3612 wstr = _Py_char2wchar(str, &wlen);
3613 if (wstr == NULL) {
3614 if (wlen == (size_t)-1)
3615 PyErr_NoMemory();
3616 else
3617 PyErr_SetFromErrno(PyExc_OSError);
3618 return NULL;
3619 }
3620
3621 unicode = PyUnicode_FromWideChar(wstr, wlen);
3622 PyMem_Free(wstr);
3623 }
3624 else {
3625#ifndef HAVE_BROKEN_MBSTOWCS
3626 wlen = mbstowcs(NULL, str, 0);
3627#else
3628 wlen = len;
3629#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003630 if (wlen == (size_t)-1)
3631 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003632 if (wlen+1 <= smallbuf_len) {
3633 wstr = smallbuf;
3634 }
3635 else {
3636 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3637 return PyErr_NoMemory();
3638
3639 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3640 if (!wstr)
3641 return PyErr_NoMemory();
3642 }
3643
3644 /* This shouldn't fail now */
3645 wlen2 = mbstowcs(wstr, str, wlen+1);
3646 if (wlen2 == (size_t)-1) {
3647 if (wstr != smallbuf)
3648 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003649 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003650 }
3651#ifdef HAVE_BROKEN_MBSTOWCS
3652 assert(wlen2 == wlen);
3653#endif
3654 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3655 if (wstr != smallbuf)
3656 PyMem_Free(wstr);
3657 }
3658 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003659
3660decode_error:
3661 errmsg = strerror(errno);
3662 assert(errmsg != NULL);
3663
3664 error_pos = mbstowcs_errorpos(str, len);
3665 if (errmsg != NULL) {
3666 size_t errlen;
3667 wstr = _Py_char2wchar(errmsg, &errlen);
3668 if (wstr != NULL) {
3669 reason = PyUnicode_FromWideChar(wstr, errlen);
3670 PyMem_Free(wstr);
3671 } else
3672 errmsg = NULL;
3673 }
3674 if (errmsg == NULL)
3675 reason = PyUnicode_FromString(
3676 "mbstowcs() encountered an invalid multibyte sequence");
3677 if (reason == NULL)
3678 return NULL;
3679
3680 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3681 "locale", str, len,
3682 (Py_ssize_t)error_pos,
3683 (Py_ssize_t)(error_pos+1),
3684 reason);
3685 Py_DECREF(reason);
3686 if (exc != NULL) {
3687 PyCodec_StrictErrors(exc);
3688 Py_XDECREF(exc);
3689 }
3690 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003691}
3692
3693PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003694PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003695{
3696 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003697 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003698}
3699
3700
3701PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003702PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003703 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003704 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3705}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003706
Christian Heimes5894ba72007-11-04 11:43:14 +00003707PyObject*
3708PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3709{
Victor Stinner99b95382011-07-04 14:23:54 +02003710#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003711 return PyUnicode_DecodeMBCS(s, size, NULL);
3712#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003713 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003714#else
Victor Stinner793b5312011-04-27 00:24:21 +02003715 PyInterpreterState *interp = PyThreadState_GET()->interp;
3716 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3717 cannot use it to encode and decode filenames before it is loaded. Load
3718 the Python codec requires to encode at least its own filename. Use the C
3719 version of the locale codec until the codec registry is initialized and
3720 the Python codec is loaded.
3721
3722 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3723 cannot only rely on it: check also interp->fscodec_initialized for
3724 subinterpreters. */
3725 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003726 return PyUnicode_Decode(s, size,
3727 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003728 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003729 }
3730 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003731 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003732 }
Victor Stinnerad158722010-10-27 00:25:46 +00003733#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003734}
3735
Martin v. Löwis011e8422009-05-05 04:43:17 +00003736
3737int
Antoine Pitrou13348842012-01-29 18:36:34 +01003738_PyUnicode_HasNULChars(PyObject* s)
3739{
3740 static PyObject *nul = NULL;
3741
3742 if (nul == NULL)
3743 nul = PyUnicode_FromStringAndSize("\0", 1);
3744 if (nul == NULL)
3745 return -1;
3746 return PyUnicode_Contains(s, nul);
3747}
3748
3749
3750int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003751PyUnicode_FSConverter(PyObject* arg, void* addr)
3752{
3753 PyObject *output = NULL;
3754 Py_ssize_t size;
3755 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003756 if (arg == NULL) {
3757 Py_DECREF(*(PyObject**)addr);
3758 return 1;
3759 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003760 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003761 output = arg;
3762 Py_INCREF(output);
3763 }
3764 else {
3765 arg = PyUnicode_FromObject(arg);
3766 if (!arg)
3767 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003768 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003769 Py_DECREF(arg);
3770 if (!output)
3771 return 0;
3772 if (!PyBytes_Check(output)) {
3773 Py_DECREF(output);
3774 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3775 return 0;
3776 }
3777 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003778 size = PyBytes_GET_SIZE(output);
3779 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003780 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003781 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003782 Py_DECREF(output);
3783 return 0;
3784 }
3785 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003786 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003787}
3788
3789
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003790int
3791PyUnicode_FSDecoder(PyObject* arg, void* addr)
3792{
3793 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003794 if (arg == NULL) {
3795 Py_DECREF(*(PyObject**)addr);
3796 return 1;
3797 }
3798 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003799 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003800 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003801 output = arg;
3802 Py_INCREF(output);
3803 }
3804 else {
3805 arg = PyBytes_FromObject(arg);
3806 if (!arg)
3807 return 0;
3808 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3809 PyBytes_GET_SIZE(arg));
3810 Py_DECREF(arg);
3811 if (!output)
3812 return 0;
3813 if (!PyUnicode_Check(output)) {
3814 Py_DECREF(output);
3815 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3816 return 0;
3817 }
3818 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003819 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003820 Py_DECREF(output);
3821 return 0;
3822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003823 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003824 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003825 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3826 Py_DECREF(output);
3827 return 0;
3828 }
3829 *(PyObject**)addr = output;
3830 return Py_CLEANUP_SUPPORTED;
3831}
3832
3833
Martin v. Löwis5b222132007-06-10 09:51:05 +00003834char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003836{
Christian Heimesf3863112007-11-22 07:46:41 +00003837 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003839 if (!PyUnicode_Check(unicode)) {
3840 PyErr_BadArgument();
3841 return NULL;
3842 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003843 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003844 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003846 if (PyUnicode_UTF8(unicode) == NULL) {
3847 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003848 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3849 if (bytes == NULL)
3850 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003851 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3852 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853 Py_DECREF(bytes);
3854 return NULL;
3855 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003856 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3857 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3858 PyBytes_AS_STRING(bytes),
3859 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860 Py_DECREF(bytes);
3861 }
3862
3863 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003864 *psize = PyUnicode_UTF8_LENGTH(unicode);
3865 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003866}
3867
3868char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3872}
3873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874Py_UNICODE *
3875PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 const unsigned char *one_byte;
3878#if SIZEOF_WCHAR_T == 4
3879 const Py_UCS2 *two_bytes;
3880#else
3881 const Py_UCS4 *four_bytes;
3882 const Py_UCS4 *ucs4_end;
3883 Py_ssize_t num_surrogates;
3884#endif
3885 wchar_t *w;
3886 wchar_t *wchar_end;
3887
3888 if (!PyUnicode_Check(unicode)) {
3889 PyErr_BadArgument();
3890 return NULL;
3891 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003892 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003894 assert(_PyUnicode_KIND(unicode) != 0);
3895 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003896
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003897 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003898#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003899 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3900 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901 num_surrogates = 0;
3902
3903 for (; four_bytes < ucs4_end; ++four_bytes) {
3904 if (*four_bytes > 0xFFFF)
3905 ++num_surrogates;
3906 }
3907
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003908 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3909 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3910 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911 PyErr_NoMemory();
3912 return NULL;
3913 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003914 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003916 w = _PyUnicode_WSTR(unicode);
3917 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3918 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3920 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003921 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003923 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3924 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003925 }
3926 else
3927 *w = *four_bytes;
3928
3929 if (w > wchar_end) {
3930 assert(0 && "Miscalculated string end");
3931 }
3932 }
3933 *w = 0;
3934#else
3935 /* sizeof(wchar_t) == 4 */
3936 Py_FatalError("Impossible unicode object state, wstr and str "
3937 "should share memory already.");
3938 return NULL;
3939#endif
3940 }
3941 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003942 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3943 (_PyUnicode_LENGTH(unicode) + 1));
3944 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945 PyErr_NoMemory();
3946 return NULL;
3947 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003948 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3949 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3950 w = _PyUnicode_WSTR(unicode);
3951 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003953 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3954 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955 for (; w < wchar_end; ++one_byte, ++w)
3956 *w = *one_byte;
3957 /* null-terminate the wstr */
3958 *w = 0;
3959 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003960 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003962 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963 for (; w < wchar_end; ++two_bytes, ++w)
3964 *w = *two_bytes;
3965 /* null-terminate the wstr */
3966 *w = 0;
3967#else
3968 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003969 PyObject_FREE(_PyUnicode_WSTR(unicode));
3970 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971 Py_FatalError("Impossible unicode object state, wstr "
3972 "and str should share memory already.");
3973 return NULL;
3974#endif
3975 }
3976 else {
3977 assert(0 && "This should never happen.");
3978 }
3979 }
3980 }
3981 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003982 *size = PyUnicode_WSTR_LENGTH(unicode);
3983 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003984}
3985
Alexander Belopolsky40018472011-02-26 01:02:56 +00003986Py_UNICODE *
3987PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003989 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990}
3991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992
Alexander Belopolsky40018472011-02-26 01:02:56 +00003993Py_ssize_t
3994PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995{
3996 if (!PyUnicode_Check(unicode)) {
3997 PyErr_BadArgument();
3998 goto onError;
3999 }
4000 return PyUnicode_GET_SIZE(unicode);
4001
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 return -1;
4004}
4005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006Py_ssize_t
4007PyUnicode_GetLength(PyObject *unicode)
4008{
Victor Stinner07621332012-06-16 04:53:46 +02004009 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010 PyErr_BadArgument();
4011 return -1;
4012 }
Victor Stinner07621332012-06-16 04:53:46 +02004013 if (PyUnicode_READY(unicode) == -1)
4014 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015 return PyUnicode_GET_LENGTH(unicode);
4016}
4017
4018Py_UCS4
4019PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4020{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004021 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4022 PyErr_BadArgument();
4023 return (Py_UCS4)-1;
4024 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004025 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004026 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027 return (Py_UCS4)-1;
4028 }
4029 return PyUnicode_READ_CHAR(unicode, index);
4030}
4031
4032int
4033PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4034{
4035 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004036 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 return -1;
4038 }
Victor Stinner488fa492011-12-12 00:01:39 +01004039 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004040 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004041 PyErr_SetString(PyExc_IndexError, "string index out of range");
4042 return -1;
4043 }
Victor Stinner488fa492011-12-12 00:01:39 +01004044 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004045 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004046 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4047 PyErr_SetString(PyExc_ValueError, "character out of range");
4048 return -1;
4049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4051 index, ch);
4052 return 0;
4053}
4054
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is a constant and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4060
Victor Stinner554f3f02010-06-16 23:33:54 +00004061/* create or adjust a UnicodeDecodeError */
4062static void
4063make_decode_exception(PyObject **exceptionObject,
4064 const char *encoding,
4065 const char *input, Py_ssize_t length,
4066 Py_ssize_t startpos, Py_ssize_t endpos,
4067 const char *reason)
4068{
4069 if (*exceptionObject == NULL) {
4070 *exceptionObject = PyUnicodeDecodeError_Create(
4071 encoding, input, length, startpos, endpos, reason);
4072 }
4073 else {
4074 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4075 goto onError;
4076 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4077 goto onError;
4078 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4079 goto onError;
4080 }
4081 return;
4082
4083onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004084 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004085}
4086
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004087/* error handling callback helper:
4088 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004089 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004090 and adjust various state variables.
4091 return 0 on success, -1 on error
4092*/
4093
Alexander Belopolsky40018472011-02-26 01:02:56 +00004094static int
4095unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004096 const char *encoding, const char *reason,
4097 const char **input, const char **inend, Py_ssize_t *startinpos,
4098 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004099 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004100{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004101 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102
4103 PyObject *restuple = NULL;
4104 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004105 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004106 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004107 Py_ssize_t requiredsize;
4108 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004109 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 int res = -1;
4111
Victor Stinner596a6c42011-11-09 00:02:18 +01004112 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4113 outsize = PyUnicode_GET_LENGTH(*output);
4114 else
4115 outsize = _PyUnicode_WSTR_LENGTH(*output);
4116
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004117 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004118 *errorHandler = PyCodec_LookupError(errors);
4119 if (*errorHandler == NULL)
4120 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 }
4122
Victor Stinner554f3f02010-06-16 23:33:54 +00004123 make_decode_exception(exceptionObject,
4124 encoding,
4125 *input, *inend - *input,
4126 *startinpos, *endinpos,
4127 reason);
4128 if (*exceptionObject == NULL)
4129 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130
4131 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4132 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004135 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137 }
4138 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004139 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004140 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004141 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004142
4143 /* Copy back the bytes variables, which might have been modified by the
4144 callback */
4145 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4146 if (!inputobj)
4147 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004148 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004149 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004150 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004151 *input = PyBytes_AS_STRING(inputobj);
4152 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004153 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004154 /* we can DECREF safely, as the exception has another reference,
4155 so the object won't go away. */
4156 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004157
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004159 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004160 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4162 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004163 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164
Victor Stinner596a6c42011-11-09 00:02:18 +01004165 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4166 /* need more space? (at least enough for what we
4167 have+the replacement+the rest of the string (starting
4168 at the new input position), so we won't have to check space
4169 when there are no errors in the rest of the string) */
4170 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4171 requiredsize = *outpos + replen + insize-newpos;
4172 if (requiredsize > outsize) {
4173 if (requiredsize<2*outsize)
4174 requiredsize = 2*outsize;
4175 if (unicode_resize(output, requiredsize) < 0)
4176 goto onError;
4177 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004178 if (unicode_widen(output, *outpos,
4179 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004181 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004182 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004184 else {
4185 wchar_t *repwstr;
4186 Py_ssize_t repwlen;
4187 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4188 if (repwstr == NULL)
4189 goto onError;
4190 /* need more space? (at least enough for what we
4191 have+the replacement+the rest of the string (starting
4192 at the new input position), so we won't have to check space
4193 when there are no errors in the rest of the string) */
4194 requiredsize = *outpos + repwlen + insize-newpos;
4195 if (requiredsize > outsize) {
4196 if (requiredsize < 2*outsize)
4197 requiredsize = 2*outsize;
4198 if (unicode_resize(output, requiredsize) < 0)
4199 goto onError;
4200 }
4201 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4202 *outpos += repwlen;
4203 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004205 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004206
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207 /* we made it! */
4208 res = 0;
4209
Benjamin Peterson29060642009-01-31 22:14:21 +00004210 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211 Py_XDECREF(restuple);
4212 return res;
4213}
4214
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004215/* --- UTF-7 Codec -------------------------------------------------------- */
4216
Antoine Pitrou244651a2009-05-04 18:56:13 +00004217/* See RFC2152 for details. We encode conservatively and decode liberally. */
4218
/* Three simple macros defining base-64.  Each one may evaluate its
   argument more than once, so arguments must be free of side effects. */

/* IS_BASE64(c): non-zero when c is one of the 64 base-64 characters
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ||     \
     ('a' <= (c) && (c) <= 'z') ||     \
     ('0' <= (c) && (c) <= '9') ||     \
     '+' == (c) || '/' == (c))

/* FROM_BASE64(c): the 6-bit value (0..63) of the base-64 character c.
   Only meaningful when IS_BASE64(c) holds. */

#define FROM_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     '+' == (c) ? 62 : 63)

/* TO_BASE64(n): the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4241
/* DECODE_DIRECT(c): non-zero when a byte met in a UTF-7 string decodes
 * as itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section.  The argument may be evaluated
 * twice. */

#define DECODE_DIRECT(c) \
    ((c) < 128 && '+' != (c))
4249
4250/* The UTF-7 encoder treats ASCII characters differently according to
4251 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4252 * the above). See RFC2152. This array identifies these different
4253 * sets:
4254 * 0 : "Set D"
4255 * alphanumeric and '(),-./:?
4256 * 1 : "Set O"
4257 * !"#$%&*;<=>@[]^_`{|}
4258 * 2 : "whitespace"
4259 * ht nl cr sp
4260 * 3 : special (must be base64 encoded)
4261 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4262 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004263
/* Category (0 = Set D, 1 = Set O, 2 = whitespace, 3 = special) of each
 * ASCII code point, indexed by code point.  The table is only ever read,
 * so it is declared const. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - code point to classify; evaluated several times, so it
 *              must be free of side effects.  NUL and anything >= 128
 *              are never direct (and never index the table).
 *   directO  - non-zero if Set O characters may be written directly.
 *   directWS - non-zero if whitespace may be written directly.
 *
 * The flag arguments are parenthesized in the expansion so that
 * compound expressions (e.g. `a || b`) keep their intended grouping. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004295
Alexander Belopolsky40018472011-02-26 01:02:56 +00004296PyObject *
4297PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004298 Py_ssize_t size,
4299 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004300{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004301 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4302}
4303
Antoine Pitrou244651a2009-05-04 18:56:13 +00004304/* The decoder. The only state we preserve is our read position,
4305 * i.e. how many characters we have consumed. So if we end in the
4306 * middle of a shift sequence we have to back off the read position
4307 * and the output to the beginning of the sequence, otherwise we lose
4308 * all the shift state (seen bits, number of bits seen, high
4309 * surrogate). */
4310
Alexander Belopolsky40018472011-02-26 01:02:56 +00004311PyObject *
4312PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004313 Py_ssize_t size,
4314 const char *errors,
4315 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004316{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004318 Py_ssize_t startinpos;
4319 Py_ssize_t endinpos;
4320 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004321 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004322 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323 const char *errmsg = "";
4324 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004325 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004326 unsigned int base64bits = 0;
4327 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004328 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004329 PyObject *errorHandler = NULL;
4330 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004331
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004332 /* Start off assuming it's all ASCII. Widen later as necessary. */
4333 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334 if (!unicode)
4335 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004336 if (size == 0) {
4337 if (consumed)
4338 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004339 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004340 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004341
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004342 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004343 e = s + size;
4344
4345 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004346 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004347 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004348 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004349
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350 if (inShift) { /* in a base-64 section */
4351 if (IS_BASE64(ch)) { /* consume a base-64 character */
4352 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4353 base64bits += 6;
4354 s++;
4355 if (base64bits >= 16) {
4356 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004357 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 base64bits -= 16;
4359 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004360 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004361 if (surrogate) {
4362 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004363 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4364 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004365 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4366 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004368 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369 }
4370 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004371 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4372 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004373 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 }
4375 }
Victor Stinner551ac952011-11-29 22:58:13 +01004376 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004377 /* first surrogate */
4378 surrogate = outCh;
4379 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004381 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4382 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383 }
4384 }
4385 }
4386 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004387 inShift = 0;
4388 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004389 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004390 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4391 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004392 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004393 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004394 if (base64bits > 0) { /* left-over bits */
4395 if (base64bits >= 6) {
4396 /* We've seen at least one base-64 character */
4397 errmsg = "partial character in shift sequence";
4398 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 else {
4401 /* Some bits remain; they should be zero */
4402 if (base64buffer != 0) {
4403 errmsg = "non-zero padding bits in shift sequence";
4404 goto utf7Error;
4405 }
4406 }
4407 }
4408 if (ch != '-') {
4409 /* '-' is absorbed; other terminating
4410 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004411 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4412 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004413 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 }
4415 }
4416 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 s++; /* consume '+' */
4419 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004420 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004421 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4422 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 }
4424 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004425 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004426 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004428 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004429 }
4430 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004431 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004432 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4433 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004434 s++;
4435 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004436 else {
4437 startinpos = s-starts;
4438 s++;
4439 errmsg = "unexpected special character";
4440 goto utf7Error;
4441 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004443utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 endinpos = s-starts;
4445 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 errors, &errorHandler,
4447 "utf7", errmsg,
4448 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004449 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004450 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004451 }
4452
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453 /* end of string */
4454
4455 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4456 /* if we're in an inconsistent state, that's an error */
4457 if (surrogate ||
4458 (base64bits >= 6) ||
4459 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 endinpos = size;
4461 if (unicode_decode_call_errorhandler(
4462 errors, &errorHandler,
4463 "utf7", "unterminated shift sequence",
4464 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004465 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 goto onError;
4467 if (s < e)
4468 goto restart;
4469 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471
4472 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004473 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004474 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004475 *consumed = startinpos;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004476 if (outpos != shiftOutStart &&
4477 PyUnicode_MAX_CHAR_VALUE(unicode) > 127) {
4478 PyObject *result = PyUnicode_FromKindAndData(
4479 PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4480 shiftOutStart);
4481 Py_DECREF(unicode);
4482 unicode = result;
4483 }
4484 outpos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004485 }
4486 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004487 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004488 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004489 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004491 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492 goto onError;
4493
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 Py_XDECREF(errorHandler);
4495 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004496 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004497
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004499 Py_XDECREF(errorHandler);
4500 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004501 Py_DECREF(unicode);
4502 return NULL;
4503}
4504
4505
Alexander Belopolsky40018472011-02-26 01:02:56 +00004506PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004507_PyUnicode_EncodeUTF7(PyObject *str,
4508 int base64SetO,
4509 int base64WhiteSpace,
4510 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004511{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004512 int kind;
4513 void *data;
4514 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004515 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004517 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004518 unsigned int base64bits = 0;
4519 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004520 char * out;
4521 char * start;
4522
Benjamin Petersonbac79492012-01-14 13:34:47 -05004523 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004524 return NULL;
4525 kind = PyUnicode_KIND(str);
4526 data = PyUnicode_DATA(str);
4527 len = PyUnicode_GET_LENGTH(str);
4528
4529 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004531
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004532 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004533 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004534 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004535 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536 if (v == NULL)
4537 return NULL;
4538
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004539 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004540 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004541 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 if (inShift) {
4544 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4545 /* shifting out */
4546 if (base64bits) { /* output remaining bits */
4547 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4548 base64buffer = 0;
4549 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550 }
4551 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004552 /* Characters not in the BASE64 set implicitly unshift the sequence
4553 so no '-' is required, except if the character is itself a '-' */
4554 if (IS_BASE64(ch) || ch == '-') {
4555 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004557 *out++ = (char) ch;
4558 }
4559 else {
4560 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004561 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004562 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004563 else { /* not in a shift sequence */
4564 if (ch == '+') {
4565 *out++ = '+';
4566 *out++ = '-';
4567 }
4568 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4569 *out++ = (char) ch;
4570 }
4571 else {
4572 *out++ = '+';
4573 inShift = 1;
4574 goto encode_char;
4575 }
4576 }
4577 continue;
4578encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004580 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004581
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 /* code first surrogate */
4583 base64bits += 16;
4584 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4585 while (base64bits >= 6) {
4586 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4587 base64bits -= 6;
4588 }
4589 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004590 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004591 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592 base64bits += 16;
4593 base64buffer = (base64buffer << 16) | ch;
4594 while (base64bits >= 6) {
4595 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4596 base64bits -= 6;
4597 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004598 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004599 if (base64bits)
4600 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4601 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004602 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004603 if (_PyBytes_Resize(&v, out - start) < 0)
4604 return NULL;
4605 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004606}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004607PyObject *
4608PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4609 Py_ssize_t size,
4610 int base64SetO,
4611 int base64WhiteSpace,
4612 const char *errors)
4613{
4614 PyObject *result;
4615 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4616 if (tmp == NULL)
4617 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004618 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004619 base64WhiteSpace, errors);
4620 Py_DECREF(tmp);
4621 return result;
4622}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004623
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624#undef IS_BASE64
4625#undef FROM_BASE64
4626#undef TO_BASE64
4627#undef DECODE_DIRECT
4628#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004629
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630/* --- UTF-8 Codec -------------------------------------------------------- */
4631
Alexander Belopolsky40018472011-02-26 01:02:56 +00004632PyObject *
4633PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004634 Py_ssize_t size,
4635 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636{
Walter Dörwald69652032004-09-07 20:24:22 +00004637 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4638}
4639
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004640#include "stringlib/asciilib.h"
4641#include "stringlib/codecs.h"
4642#include "stringlib/undef.h"
4643
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004644#include "stringlib/ucs1lib.h"
4645#include "stringlib/codecs.h"
4646#include "stringlib/undef.h"
4647
4648#include "stringlib/ucs2lib.h"
4649#include "stringlib/codecs.h"
4650#include "stringlib/undef.h"
4651
4652#include "stringlib/ucs4lib.h"
4653#include "stringlib/codecs.h"
4654#include "stringlib/undef.h"
4655
Antoine Pitrouab868312009-01-10 15:40:25 +00004656/* Mask to quickly check whether a C 'long' contains a
4657 non-ASCII, UTF8-encoded char. */
4658#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004659# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004660#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004661# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004662#else
4663# error C 'long' size should be either 4 or 8!
4664#endif
4665
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004666static Py_ssize_t
4667ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004668{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004669 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004670 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004671
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004672 /*
4673 * Issue #17237: m68k is a bit different from most architectures in
4674 * that objects do not use "natural alignment" - for example, int and
4675 * long are only aligned at 2-byte boundaries. Therefore the assert()
4676 * won't work; also, tests have shown that skipping the "optimised
4677 * version" will even speed up m68k.
4678 */
4679#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004680#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004681 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4682 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004683 /* Fast path, see in STRINGLIB(utf8_decode) for
4684 an explanation. */
4685 /* Help register allocation */
4686 register const char *_p = p;
4687 register Py_UCS1 * q = dest;
4688 while (_p < aligned_end) {
4689 unsigned long value = *(const unsigned long *) _p;
4690 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004691 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004692 *((unsigned long *)q) = value;
4693 _p += SIZEOF_LONG;
4694 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004695 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004696 p = _p;
4697 while (p < end) {
4698 if ((unsigned char)*p & 0x80)
4699 break;
4700 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004702 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004704#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004705#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004706 while (p < end) {
4707 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4708 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004709 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004710 /* Help register allocation */
4711 register const char *_p = p;
4712 while (_p < aligned_end) {
4713 unsigned long value = *(unsigned long *) _p;
4714 if (value & ASCII_CHAR_MASK)
4715 break;
4716 _p += SIZEOF_LONG;
4717 }
4718 p = _p;
4719 if (_p == end)
4720 break;
4721 }
4722 if ((unsigned char)*p & 0x80)
4723 break;
4724 ++p;
4725 }
4726 memcpy(dest, start, p - start);
4727 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728}
Antoine Pitrouab868312009-01-10 15:40:25 +00004729
Victor Stinner785938e2011-12-11 20:09:03 +01004730PyObject *
4731PyUnicode_DecodeUTF8Stateful(const char *s,
4732 Py_ssize_t size,
4733 const char *errors,
4734 Py_ssize_t *consumed)
4735{
Victor Stinner785938e2011-12-11 20:09:03 +01004736 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004737 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004738 const char *end = s + size;
4739 Py_ssize_t outpos;
4740
4741 Py_ssize_t startinpos;
4742 Py_ssize_t endinpos;
4743 const char *errmsg = "";
4744 PyObject *errorHandler = NULL;
4745 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004746
4747 if (size == 0) {
4748 if (consumed)
4749 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004750 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004751 }
4752
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004753 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4754 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004755 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004756 *consumed = 1;
4757 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004758 }
4759
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004760 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004761 if (!unicode)
4762 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004763
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004764 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4765 s += outpos;
4766 while (s < end) {
4767 Py_UCS4 ch;
4768 int kind = PyUnicode_KIND(unicode);
4769 if (kind == PyUnicode_1BYTE_KIND) {
4770 if (PyUnicode_IS_ASCII(unicode))
4771 ch = asciilib_utf8_decode(&s, end,
4772 PyUnicode_1BYTE_DATA(unicode), &outpos);
4773 else
4774 ch = ucs1lib_utf8_decode(&s, end,
4775 PyUnicode_1BYTE_DATA(unicode), &outpos);
4776 } else if (kind == PyUnicode_2BYTE_KIND) {
4777 ch = ucs2lib_utf8_decode(&s, end,
4778 PyUnicode_2BYTE_DATA(unicode), &outpos);
4779 } else {
4780 assert(kind == PyUnicode_4BYTE_KIND);
4781 ch = ucs4lib_utf8_decode(&s, end,
4782 PyUnicode_4BYTE_DATA(unicode), &outpos);
4783 }
4784
4785 switch (ch) {
4786 case 0:
4787 if (s == end || consumed)
4788 goto End;
4789 errmsg = "unexpected end of data";
4790 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004791 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004792 break;
4793 case 1:
4794 errmsg = "invalid start byte";
4795 startinpos = s - starts;
4796 endinpos = startinpos + 1;
4797 break;
4798 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004799 case 3:
4800 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004801 errmsg = "invalid continuation byte";
4802 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004803 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804 break;
4805 default:
4806 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4807 goto onError;
4808 continue;
4809 }
4810
4811 if (unicode_decode_call_errorhandler(
4812 errors, &errorHandler,
4813 "utf-8", errmsg,
4814 &starts, &end, &startinpos, &endinpos, &exc, &s,
4815 &unicode, &outpos))
4816 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004817 }
4818
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004819End:
4820 if (unicode_resize(&unicode, outpos) < 0)
4821 goto onError;
4822
4823 if (consumed)
4824 *consumed = s - starts;
4825
4826 Py_XDECREF(errorHandler);
4827 Py_XDECREF(exc);
4828 assert(_PyUnicode_CheckConsistency(unicode, 1));
4829 return unicode;
4830
4831onError:
4832 Py_XDECREF(errorHandler);
4833 Py_XDECREF(exc);
4834 Py_XDECREF(unicode);
4835 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004836}
4837
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004838#ifdef __APPLE__
4839
4840/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner27b1ca22012-12-03 12:47:59 +01004841 used to decode the command line arguments on Mac OS X.
4842
4843 Return a pointer to a newly allocated wide character string (use
4844 PyMem_Free() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004845
4846wchar_t*
4847_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4848{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004849 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004850 wchar_t *unicode;
4851 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004852
4853 /* Note: size will always be longer than the resulting Unicode
4854 character count */
Victor Stinner27b1ca22012-12-03 12:47:59 +01004855 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004856 return NULL;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004857 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4858 if (!unicode)
4859 return NULL;
4860
4861 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004862 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004863 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004864 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004865 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004866#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004867 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004868#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004869 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004870#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004871 if (ch > 0xFF) {
4872#if SIZEOF_WCHAR_T == 4
4873 assert(0);
4874#else
4875 assert(Py_UNICODE_IS_SURROGATE(ch));
4876 /* compute and append the two surrogates: */
4877 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4878 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4879#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004880 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004881 else {
4882 if (!ch && s == e)
4883 break;
4884 /* surrogateescape */
4885 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4886 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004887 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004889 return unicode;
4890}
4891
4892#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004894/* Primary internal function which creates utf8 encoded bytes objects.
4895
4896 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004897 and allocate exactly as much space needed at the end. Else allocate the
4898 maximum possible needed (4 result bytes per Unicode character), and return
4899 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004900*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004901PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004902_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903{
Victor Stinner6099a032011-12-18 14:22:26 +01004904 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004905 void *data;
4906 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004908 if (!PyUnicode_Check(unicode)) {
4909 PyErr_BadArgument();
4910 return NULL;
4911 }
4912
4913 if (PyUnicode_READY(unicode) == -1)
4914 return NULL;
4915
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004916 if (PyUnicode_UTF8(unicode))
4917 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4918 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004919
4920 kind = PyUnicode_KIND(unicode);
4921 data = PyUnicode_DATA(unicode);
4922 size = PyUnicode_GET_LENGTH(unicode);
4923
Benjamin Petersonead6b532011-12-20 17:23:42 -06004924 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004925 default:
4926 assert(0);
4927 case PyUnicode_1BYTE_KIND:
4928 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4929 assert(!PyUnicode_IS_ASCII(unicode));
4930 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4931 case PyUnicode_2BYTE_KIND:
4932 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4933 case PyUnicode_4BYTE_KIND:
4934 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004935 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936}
4937
Alexander Belopolsky40018472011-02-26 01:02:56 +00004938PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004939PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4940 Py_ssize_t size,
4941 const char *errors)
4942{
4943 PyObject *v, *unicode;
4944
4945 unicode = PyUnicode_FromUnicode(s, size);
4946 if (unicode == NULL)
4947 return NULL;
4948 v = _PyUnicode_AsUTF8String(unicode, errors);
4949 Py_DECREF(unicode);
4950 return v;
4951}
4952
4953PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004954PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004956 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957}
4958
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959/* --- UTF-32 Codec ------------------------------------------------------- */
4960
4961PyObject *
4962PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 Py_ssize_t size,
4964 const char *errors,
4965 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004966{
4967 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4968}
4969
4970PyObject *
4971PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004972 Py_ssize_t size,
4973 const char *errors,
4974 int *byteorder,
4975 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004976{
4977 const char *starts = s;
4978 Py_ssize_t startinpos;
4979 Py_ssize_t endinpos;
4980 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004981 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004982 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004983 int bo = 0; /* assume native ordering by default */
4984 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004985 /* Offsets from q for retrieving bytes in the right order. */
4986#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4987 int iorder[] = {0, 1, 2, 3};
4988#else
4989 int iorder[] = {3, 2, 1, 0};
4990#endif
4991 PyObject *errorHandler = NULL;
4992 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004993
Walter Dörwald41980ca2007-08-16 21:55:45 +00004994 q = (unsigned char *)s;
4995 e = q + size;
4996
4997 if (byteorder)
4998 bo = *byteorder;
4999
5000 /* Check for BOM marks (U+FEFF) in the input and adjust current
5001 byte order setting accordingly. In native mode, the leading BOM
5002 mark is skipped, in all other modes, it is copied to the output
5003 stream as-is (giving a ZWNBSP character). */
5004 if (bo == 0) {
5005 if (size >= 4) {
5006 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 if (bom == 0x0000FEFF) {
5010 q += 4;
5011 bo = -1;
5012 }
5013 else if (bom == 0xFFFE0000) {
5014 q += 4;
5015 bo = 1;
5016 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005017#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 if (bom == 0x0000FEFF) {
5019 q += 4;
5020 bo = 1;
5021 }
5022 else if (bom == 0xFFFE0000) {
5023 q += 4;
5024 bo = -1;
5025 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005026#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005027 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005028 }
5029
5030 if (bo == -1) {
5031 /* force LE */
5032 iorder[0] = 0;
5033 iorder[1] = 1;
5034 iorder[2] = 2;
5035 iorder[3] = 3;
5036 }
5037 else if (bo == 1) {
5038 /* force BE */
5039 iorder[0] = 3;
5040 iorder[1] = 2;
5041 iorder[2] = 1;
5042 iorder[3] = 0;
5043 }
5044
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005045 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005046 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005047 if (!unicode)
5048 return NULL;
5049 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005050 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005051 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005052
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 Py_UCS4 ch;
5055 /* remaining bytes at the end? (size should be divisible by 4) */
5056 if (e-q<4) {
5057 if (consumed)
5058 break;
5059 errmsg = "truncated data";
5060 startinpos = ((const char *)q)-starts;
5061 endinpos = ((const char *)e)-starts;
5062 goto utf32Error;
5063 /* The remaining input chars are ignored if the callback
5064 chooses to skip the input */
5065 }
5066 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5067 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005068
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 if (ch >= 0x110000)
5070 {
5071 errmsg = "codepoint not in range(0x110000)";
5072 startinpos = ((const char *)q)-starts;
5073 endinpos = startinpos+4;
5074 goto utf32Error;
5075 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005076 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5077 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 q += 4;
5079 continue;
5080 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005081 if (unicode_decode_call_errorhandler(
5082 errors, &errorHandler,
5083 "utf32", errmsg,
5084 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005085 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087 }
5088
5089 if (byteorder)
5090 *byteorder = bo;
5091
5092 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094
5095 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005096 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005097 goto onError;
5098
5099 Py_XDECREF(errorHandler);
5100 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005101 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005102
Benjamin Peterson29060642009-01-31 22:14:21 +00005103 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005104 Py_DECREF(unicode);
5105 Py_XDECREF(errorHandler);
5106 Py_XDECREF(exc);
5107 return NULL;
5108}
5109
5110PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005111_PyUnicode_EncodeUTF32(PyObject *str,
5112 const char *errors,
5113 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005114{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005115 int kind;
5116 void *data;
5117 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005118 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005119 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005120 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005121 /* Offsets from p for storing byte pairs in the right order. */
5122#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5123 int iorder[] = {0, 1, 2, 3};
5124#else
5125 int iorder[] = {3, 2, 1, 0};
5126#endif
5127
Benjamin Peterson29060642009-01-31 22:14:21 +00005128#define STORECHAR(CH) \
5129 do { \
5130 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5131 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5132 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5133 p[iorder[0]] = (CH) & 0xff; \
5134 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005135 } while(0)
5136
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005137 if (!PyUnicode_Check(str)) {
5138 PyErr_BadArgument();
5139 return NULL;
5140 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005141 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005142 return NULL;
5143 kind = PyUnicode_KIND(str);
5144 data = PyUnicode_DATA(str);
5145 len = PyUnicode_GET_LENGTH(str);
5146
5147 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005148 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005149 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005150 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005151 if (v == NULL)
5152 return NULL;
5153
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005154 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005155 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005156 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005157 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005158 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005159
5160 if (byteorder == -1) {
5161 /* force LE */
5162 iorder[0] = 0;
5163 iorder[1] = 1;
5164 iorder[2] = 2;
5165 iorder[3] = 3;
5166 }
5167 else if (byteorder == 1) {
5168 /* force BE */
5169 iorder[0] = 3;
5170 iorder[1] = 2;
5171 iorder[2] = 1;
5172 iorder[3] = 0;
5173 }
5174
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005175 for (i = 0; i < len; i++)
5176 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005177
5178 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005179 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005180#undef STORECHAR
5181}
5182
Alexander Belopolsky40018472011-02-26 01:02:56 +00005183PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005184PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5185 Py_ssize_t size,
5186 const char *errors,
5187 int byteorder)
5188{
5189 PyObject *result;
5190 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5191 if (tmp == NULL)
5192 return NULL;
5193 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5194 Py_DECREF(tmp);
5195 return result;
5196}
5197
5198PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005199PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005200{
Victor Stinnerb960b342011-11-20 19:12:52 +01005201 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005202}
5203
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204/* --- UTF-16 Codec ------------------------------------------------------- */
5205
Tim Peters772747b2001-08-09 22:21:55 +00005206PyObject *
5207PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 Py_ssize_t size,
5209 const char *errors,
5210 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211{
Walter Dörwald69652032004-09-07 20:24:22 +00005212 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5213}
5214
5215PyObject *
5216PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005217 Py_ssize_t size,
5218 const char *errors,
5219 int *byteorder,
5220 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005221{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005222 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005223 Py_ssize_t startinpos;
5224 Py_ssize_t endinpos;
5225 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005226 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005227 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005228 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005229 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005230 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 PyObject *errorHandler = NULL;
5232 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233
Tim Peters772747b2001-08-09 22:21:55 +00005234 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005235 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236
5237 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005238 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005240 /* Check for BOM marks (U+FEFF) in the input and adjust current
5241 byte order setting accordingly. In native mode, the leading BOM
5242 mark is skipped, in all other modes, it is copied to the output
5243 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005244 if (bo == 0 && size >= 2) {
5245 const Py_UCS4 bom = (q[1] << 8) | q[0];
5246 if (bom == 0xFEFF) {
5247 q += 2;
5248 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005250 else if (bom == 0xFFFE) {
5251 q += 2;
5252 bo = 1;
5253 }
5254 if (byteorder)
5255 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257
Antoine Pitrou63065d72012-05-15 23:48:04 +02005258 if (q == e) {
5259 if (consumed)
5260 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005261 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005262 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005263
Antoine Pitrouab868312009-01-10 15:40:25 +00005264#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005265 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005266#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005267 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005268#endif
Tim Peters772747b2001-08-09 22:21:55 +00005269
Antoine Pitrou63065d72012-05-15 23:48:04 +02005270 /* Note: size will always be longer than the resulting Unicode
5271 character count */
5272 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5273 if (!unicode)
5274 return NULL;
5275
5276 outpos = 0;
5277 while (1) {
5278 Py_UCS4 ch = 0;
5279 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005280 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005281 if (kind == PyUnicode_1BYTE_KIND) {
5282 if (PyUnicode_IS_ASCII(unicode))
5283 ch = asciilib_utf16_decode(&q, e,
5284 PyUnicode_1BYTE_DATA(unicode), &outpos,
5285 native_ordering);
5286 else
5287 ch = ucs1lib_utf16_decode(&q, e,
5288 PyUnicode_1BYTE_DATA(unicode), &outpos,
5289 native_ordering);
5290 } else if (kind == PyUnicode_2BYTE_KIND) {
5291 ch = ucs2lib_utf16_decode(&q, e,
5292 PyUnicode_2BYTE_DATA(unicode), &outpos,
5293 native_ordering);
5294 } else {
5295 assert(kind == PyUnicode_4BYTE_KIND);
5296 ch = ucs4lib_utf16_decode(&q, e,
5297 PyUnicode_4BYTE_DATA(unicode), &outpos,
5298 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005299 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005300 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005301
Antoine Pitrou63065d72012-05-15 23:48:04 +02005302 switch (ch)
5303 {
5304 case 0:
5305 /* remaining byte at the end? (size should be even) */
5306 if (q == e || consumed)
5307 goto End;
5308 errmsg = "truncated data";
5309 startinpos = ((const char *)q) - starts;
5310 endinpos = ((const char *)e) - starts;
5311 break;
5312 /* The remaining input chars are ignored if the callback
5313 chooses to skip the input */
5314 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005315 q -= 2;
5316 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005317 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005318 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005319 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005320 endinpos = ((const char *)e) - starts;
5321 break;
5322 case 2:
5323 errmsg = "illegal encoding";
5324 startinpos = ((const char *)q) - 2 - starts;
5325 endinpos = startinpos + 2;
5326 break;
5327 case 3:
5328 errmsg = "illegal UTF-16 surrogate";
5329 startinpos = ((const char *)q) - 4 - starts;
5330 endinpos = startinpos + 2;
5331 break;
5332 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005333 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5334 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005335 continue;
5336 }
5337
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005339 errors,
5340 &errorHandler,
5341 "utf16", errmsg,
5342 &starts,
5343 (const char **)&e,
5344 &startinpos,
5345 &endinpos,
5346 &exc,
5347 (const char **)&q,
5348 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005349 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 }
5352
Antoine Pitrou63065d72012-05-15 23:48:04 +02005353End:
Walter Dörwald69652032004-09-07 20:24:22 +00005354 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005356
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005358 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 goto onError;
5360
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005361 Py_XDECREF(errorHandler);
5362 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005363 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367 Py_XDECREF(errorHandler);
5368 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 return NULL;
5370}
5371
Tim Peters772747b2001-08-09 22:21:55 +00005372PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005373_PyUnicode_EncodeUTF16(PyObject *str,
5374 const char *errors,
5375 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005377 enum PyUnicode_Kind kind;
5378 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005379 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005380 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005381 unsigned short *out;
5382 Py_ssize_t bytesize;
5383 Py_ssize_t pairs;
5384#ifdef WORDS_BIGENDIAN
5385 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005386#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005387 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005388#endif
5389
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005390 if (!PyUnicode_Check(str)) {
5391 PyErr_BadArgument();
5392 return NULL;
5393 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005394 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005395 return NULL;
5396 kind = PyUnicode_KIND(str);
5397 data = PyUnicode_DATA(str);
5398 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005399
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005400 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005401 if (kind == PyUnicode_4BYTE_KIND) {
5402 const Py_UCS4 *in = (const Py_UCS4 *)data;
5403 const Py_UCS4 *end = in + len;
5404 while (in < end)
5405 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005406 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005407 }
5408 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005410 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005411 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 if (v == NULL)
5413 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005415 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005416 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005417 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005419 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005420 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005421 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005422
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005423 switch (kind) {
5424 case PyUnicode_1BYTE_KIND: {
5425 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5426 break;
Tim Peters772747b2001-08-09 22:21:55 +00005427 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005428 case PyUnicode_2BYTE_KIND: {
5429 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5430 break;
Tim Peters772747b2001-08-09 22:21:55 +00005431 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005432 case PyUnicode_4BYTE_KIND: {
5433 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5434 break;
5435 }
5436 default:
5437 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005438 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005439
5440 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005441 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442}
5443
Alexander Belopolsky40018472011-02-26 01:02:56 +00005444PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005445PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5446 Py_ssize_t size,
5447 const char *errors,
5448 int byteorder)
5449{
5450 PyObject *result;
5451 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5452 if (tmp == NULL)
5453 return NULL;
5454 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5455 Py_DECREF(tmp);
5456 return result;
5457}
5458
5459PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005460PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005462 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463}
5464
5465/* --- Unicode Escape Codec ----------------------------------------------- */
5466
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string s of length size decodes to a pure ASCII string
   without needing any numeric or named escape.

   Returns the number of characters required to hold the decoded string,
   or -1 if
     - size is negative,
     - the input contains a non-ASCII byte,
     - a backslash is the last byte of the input, or
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee ASCII characters).

   A backslash followed by a newline contributes zero characters, the
   simple escapes (\\ \' \" \b \f \t \n \r \v \a) contribute one, and a
   backslash followed by any other ASCII character contributes two (the
   backslash and the character itself).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming s + size, so that no pointer
       in front of the buffer is ever computed. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5521
/* Cached pointer to the Unicode name lookup C API.  It starts out NULL and
   is filled in by PyUnicode_DecodeUnicodeEscape() via PyCapsule_Import()
   the first time a \N{...} escape is decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005523
Alexander Belopolsky40018472011-02-26 01:02:56 +00005524PyObject *
5525PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005526 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005527 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005529 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005530 Py_ssize_t startinpos;
5531 Py_ssize_t endinpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005532 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005534 char* message;
5535 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005536 PyObject *errorHandler = NULL;
5537 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005538 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005539 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005540
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005541 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005542
5543 /* After length_of_escaped_ascii_string() there are two alternatives,
5544 either the string is pure ASCII with named escapes like \n, etc.
5545 and we determined it's exact size (common case)
5546 or it contains \x, \u, ... escape sequences. then we create a
5547 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005548 if (len >= 0) {
5549 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005550 if (!v)
5551 goto onError;
5552 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005553 }
5554 else {
5555 /* Escaped strings will always be longer than the resulting
5556 Unicode string, so we start with size here and then reduce the
5557 length after conversion to the true value.
5558 (but if the error callback returns a long replacement string
5559 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005560 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005561 if (!v)
5562 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005563 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005564 }
5565
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005567 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005568 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005570
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 while (s < end) {
5572 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005573 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005574 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005576 /* The only case in which i == ascii_length is a backslash
5577 followed by a newline. */
5578 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005579
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 /* Non-escape characters are interpreted as Unicode ordinals */
5581 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005582 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5583 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 continue;
5585 }
5586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005587 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 /* \ - Escapes */
5589 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005590 c = *s++;
5591 if (s > end)
5592 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005593
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005594 /* The only case in which i == ascii_length is a backslash
5595 followed by a newline. */
5596 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005597
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005598 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005601#define WRITECHAR(ch) \
5602 do { \
5603 if (unicode_putchar(&v, &i, ch) < 0) \
5604 goto onError; \
5605 }while(0)
5606
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005608 case '\\': WRITECHAR('\\'); break;
5609 case '\'': WRITECHAR('\''); break;
5610 case '\"': WRITECHAR('\"'); break;
5611 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005612 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005613 case 'f': WRITECHAR('\014'); break;
5614 case 't': WRITECHAR('\t'); break;
5615 case 'n': WRITECHAR('\n'); break;
5616 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005617 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005618 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005619 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005620 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 case '0': case '1': case '2': case '3':
5624 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005625 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005626 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005627 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005628 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005629 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005631 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 break;
5633
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 /* hex escapes */
5635 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005637 digits = 2;
5638 message = "truncated \\xXX escape";
5639 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640
Benjamin Peterson29060642009-01-31 22:14:21 +00005641 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005643 digits = 4;
5644 message = "truncated \\uXXXX escape";
5645 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005648 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005649 digits = 8;
5650 message = "truncated \\UXXXXXXXX escape";
5651 hexescape:
5652 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005653 if (end - s < digits) {
5654 /* count only hex digits */
5655 for (; s < end; ++s) {
5656 c = (unsigned char)*s;
5657 if (!Py_ISXDIGIT(c))
5658 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005659 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005660 goto error;
5661 }
5662 for (; digits--; ++s) {
5663 c = (unsigned char)*s;
5664 if (!Py_ISXDIGIT(c))
5665 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005666 chr = (chr<<4) & ~0xF;
5667 if (c >= '0' && c <= '9')
5668 chr += c - '0';
5669 else if (c >= 'a' && c <= 'f')
5670 chr += 10 + c - 'a';
5671 else
5672 chr += 10 + c - 'A';
5673 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005674 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005675 /* _decoding_error will have already written into the
5676 target buffer. */
5677 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005678 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005679 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005680 message = "illegal Unicode character";
5681 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005682 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005683 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005684 break;
5685
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005687 case 'N':
5688 message = "malformed \\N character escape";
5689 if (ucnhash_CAPI == NULL) {
5690 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005691 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5692 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005693 if (ucnhash_CAPI == NULL)
5694 goto ucnhashError;
5695 }
5696 if (*s == '{') {
5697 const char *start = s+1;
5698 /* look for the closing brace */
5699 while (*s != '}' && s < end)
5700 s++;
5701 if (s > start && s < end && *s == '}') {
5702 /* found a name. look it up in the unicode database */
5703 message = "unknown Unicode character name";
5704 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005705 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005706 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005707 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005708 goto store;
5709 }
5710 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005711 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005712
5713 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005714 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 message = "\\ at end of string";
5716 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005717 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005718 }
5719 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005720 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005721 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005722 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005723 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005725 continue;
5726
5727 error:
5728 endinpos = s-starts;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005729 if (unicode_decode_call_errorhandler(
5730 errors, &errorHandler,
5731 "unicodeescape", message,
5732 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005733 &v, &i))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005734 goto onError;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005735 len = PyUnicode_GET_LENGTH(v);
Serhiy Storchakad6793772013-01-29 10:20:44 +02005736 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005738#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005739
Victor Stinner16e6a802011-12-12 13:24:15 +01005740 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005741 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005742 Py_XDECREF(errorHandler);
5743 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005744 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005745
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005747 PyErr_SetString(
5748 PyExc_UnicodeError,
5749 "\\N escapes not supported (can't load unicodedata module)"
5750 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005751 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 Py_XDECREF(errorHandler);
5753 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005754 return NULL;
5755
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 Py_XDECREF(errorHandler);
5759 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 return NULL;
5761}
5762
5763/* Return a Unicode-Escape string version of the Unicode object.
5764
5765 If quotes is true, the string is enclosed in u"" or u'' quotes as
5766 appropriate.
5767
5768*/
5769
Alexander Belopolsky40018472011-02-26 01:02:56 +00005770PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005771PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005773 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005774 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005776 int kind;
5777 void *data;
5778 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779
Ezio Melottie7f90372012-10-05 03:33:31 +03005780 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005781 escape.
5782
Ezio Melottie7f90372012-10-05 03:33:31 +03005783 For UCS1 strings it's '\xxx', 4 bytes per source character.
5784 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5785 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005786 */
5787
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005788 if (!PyUnicode_Check(unicode)) {
5789 PyErr_BadArgument();
5790 return NULL;
5791 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005792 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005793 return NULL;
5794 len = PyUnicode_GET_LENGTH(unicode);
5795 kind = PyUnicode_KIND(unicode);
5796 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005797 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005798 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5799 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5800 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5801 }
5802
5803 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005804 return PyBytes_FromStringAndSize(NULL, 0);
5805
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005806 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005808
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005809 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005811 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 if (repr == NULL)
5814 return NULL;
5815
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005816 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005818 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005819 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005820
Walter Dörwald79e913e2007-05-12 11:08:06 +00005821 /* Escape backslashes */
5822 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 *p++ = '\\';
5824 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005825 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005826 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005827
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005828 /* Map 21-bit characters to '\U00xxxxxx' */
5829 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005830 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005831 *p++ = '\\';
5832 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005833 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5834 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5835 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5836 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5837 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5838 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5839 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5840 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005842 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005843
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005845 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 *p++ = '\\';
5847 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005848 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5849 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5850 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5851 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005853
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005854 /* Map special whitespace to '\t', \n', '\r' */
5855 else if (ch == '\t') {
5856 *p++ = '\\';
5857 *p++ = 't';
5858 }
5859 else if (ch == '\n') {
5860 *p++ = '\\';
5861 *p++ = 'n';
5862 }
5863 else if (ch == '\r') {
5864 *p++ = '\\';
5865 *p++ = 'r';
5866 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005867
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005868 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005869 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005871 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005872 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5873 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005874 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005875
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 /* Copy everything else as-is */
5877 else
5878 *p++ = (char) ch;
5879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005881 assert(p - PyBytes_AS_STRING(repr) > 0);
5882 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5883 return NULL;
5884 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885}
5886
Alexander Belopolsky40018472011-02-26 01:02:56 +00005887PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005888PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5889 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 PyObject *result;
5892 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5893 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005895 result = PyUnicode_AsUnicodeEscapeString(tmp);
5896 Py_DECREF(tmp);
5897 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898}
5899
5900/* --- Raw Unicode Escape Codec ------------------------------------------- */
5901
Alexander Belopolsky40018472011-02-26 01:02:56 +00005902PyObject *
5903PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005904 Py_ssize_t size,
5905 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005907 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005908 Py_ssize_t startinpos;
5909 Py_ssize_t endinpos;
5910 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005911 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 const char *end;
5913 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005914 PyObject *errorHandler = NULL;
5915 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005916
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 /* Escaped strings will always be longer than the resulting
5918 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919 length after conversion to the true value. (But decoding error
5920 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005921 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005925 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005926 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 end = s + size;
5928 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 unsigned char c;
5930 Py_UCS4 x;
5931 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005932 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 /* Non-escape characters are interpreted as Unicode ordinals */
5935 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005936 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5937 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005939 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 startinpos = s-starts;
5941
5942 /* \u-escapes are only interpreted iff the number of leading
5943 backslashes if odd */
5944 bs = s;
5945 for (;s < end;) {
5946 if (*s != '\\')
5947 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005948 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5949 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 }
5951 if (((s - bs) & 1) == 0 ||
5952 s >= end ||
5953 (*s != 'u' && *s != 'U')) {
5954 continue;
5955 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005956 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 count = *s=='u' ? 4 : 8;
5958 s++;
5959
5960 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005961 for (x = 0, i = 0; i < count; ++i, ++s) {
5962 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005963 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 endinpos = s-starts;
5965 if (unicode_decode_call_errorhandler(
5966 errors, &errorHandler,
5967 "rawunicodeescape", "truncated \\uXXXX",
5968 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005969 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 goto onError;
5971 goto nextByte;
5972 }
5973 x = (x<<4) & ~0xF;
5974 if (c >= '0' && c <= '9')
5975 x += c - '0';
5976 else if (c >= 'a' && c <= 'f')
5977 x += 10 + c - 'a';
5978 else
5979 x += 10 + c - 'A';
5980 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005981 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005982 if (unicode_putchar(&v, &outpos, x) < 0)
5983 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005984 } else {
5985 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005986 if (unicode_decode_call_errorhandler(
5987 errors, &errorHandler,
5988 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005990 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005992 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 nextByte:
5994 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 }
Victor Stinner16e6a802011-12-12 13:24:15 +01005996 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998 Py_XDECREF(errorHandler);
5999 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006000 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006001
Benjamin Peterson29060642009-01-31 22:14:21 +00006002 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006004 Py_XDECREF(errorHandler);
6005 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 return NULL;
6007}
6008
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006009
Alexander Belopolsky40018472011-02-26 01:02:56 +00006010PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006011PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006013 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 char *p;
6015 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006016 Py_ssize_t expandsize, pos;
6017 int kind;
6018 void *data;
6019 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006021 if (!PyUnicode_Check(unicode)) {
6022 PyErr_BadArgument();
6023 return NULL;
6024 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006025 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006026 return NULL;
6027 kind = PyUnicode_KIND(unicode);
6028 data = PyUnicode_DATA(unicode);
6029 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006030 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6031 bytes, and 1 byte characters 4. */
6032 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006033
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006034 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006036
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006037 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 if (repr == NULL)
6039 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006040 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006041 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006043 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006044 for (pos = 0; pos < len; pos++) {
6045 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 /* Map 32-bit characters to '\Uxxxxxxxx' */
6047 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006048 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006049 *p++ = '\\';
6050 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006051 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6052 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6053 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6054 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6055 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6056 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6057 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6058 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006059 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006061 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 *p++ = '\\';
6063 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006064 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6065 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6066 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6067 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 /* Copy everything else as-is */
6070 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 *p++ = (char) ch;
6072 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006073
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006074 assert(p > q);
6075 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006076 return NULL;
6077 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078}
6079
Alexander Belopolsky40018472011-02-26 01:02:56 +00006080PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006081PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6082 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006084 PyObject *result;
6085 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6086 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006087 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006088 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6089 Py_DECREF(tmp);
6090 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091}
6092
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006093/* --- Unicode Internal Codec ------------------------------------------- */
6094
Alexander Belopolsky40018472011-02-26 01:02:56 +00006095PyObject *
6096_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006097 Py_ssize_t size,
6098 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006099{
6100 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006101 Py_ssize_t startinpos;
6102 Py_ssize_t endinpos;
6103 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006104 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006105 const char *end;
6106 const char *reason;
6107 PyObject *errorHandler = NULL;
6108 PyObject *exc = NULL;
6109
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006110 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006111 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006112 1))
6113 return NULL;
6114
Thomas Wouters89f507f2006-12-13 04:49:30 +00006115 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006116 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006117 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006119 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006120 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006121 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006122 end = s + size;
6123
6124 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006125 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006126 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006127 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006128 endinpos = end-starts;
6129 reason = "truncated input";
6130 goto error;
6131 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006132 /* We copy the raw representation one byte at a time because the
6133 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006134 ((char *) &uch)[0] = s[0];
6135 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006136#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006137 ((char *) &uch)[2] = s[2];
6138 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006139#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006140 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006141#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006142 /* We have to sanity check the raw data, otherwise doom looms for
6143 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006144 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006145 endinpos = s - starts + Py_UNICODE_SIZE;
6146 reason = "illegal code point (> 0x10FFFF)";
6147 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006148 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006149#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006150 s += Py_UNICODE_SIZE;
6151#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006152 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006153 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006154 Py_UNICODE uch2;
6155 ((char *) &uch2)[0] = s[0];
6156 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006157 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006158 {
Victor Stinner551ac952011-11-29 22:58:13 +01006159 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006160 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006161 }
6162 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006163#endif
6164
6165 if (unicode_putchar(&v, &outpos, ch) < 0)
6166 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006167 continue;
6168
6169 error:
6170 startinpos = s - starts;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006171 if (unicode_decode_call_errorhandler(
6172 errors, &errorHandler,
6173 "unicode_internal", reason,
6174 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006175 &v, &outpos))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006176 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006177 }
6178
Victor Stinner16e6a802011-12-12 13:24:15 +01006179 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006180 goto onError;
6181 Py_XDECREF(errorHandler);
6182 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006183 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006184
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006186 Py_XDECREF(v);
6187 Py_XDECREF(errorHandler);
6188 Py_XDECREF(exc);
6189 return NULL;
6190}
6191
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192/* --- Latin-1 Codec ------------------------------------------------------ */
6193
Alexander Belopolsky40018472011-02-26 01:02:56 +00006194PyObject *
6195PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006196 Py_ssize_t size,
6197 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006200 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201}
6202
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006203/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006204static void
6205make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006206 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006207 PyObject *unicode,
6208 Py_ssize_t startpos, Py_ssize_t endpos,
6209 const char *reason)
6210{
6211 if (*exceptionObject == NULL) {
6212 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006213 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006214 encoding, unicode, startpos, endpos, reason);
6215 }
6216 else {
6217 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6218 goto onError;
6219 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6220 goto onError;
6221 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6222 goto onError;
6223 return;
6224 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006225 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006226 }
6227}
6228
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006229/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006230static void
6231raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006232 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006233 PyObject *unicode,
6234 Py_ssize_t startpos, Py_ssize_t endpos,
6235 const char *reason)
6236{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006237 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006238 encoding, unicode, startpos, endpos, reason);
6239 if (*exceptionObject != NULL)
6240 PyCodec_StrictErrors(*exceptionObject);
6241}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006242
6243/* error handling callback helper:
6244 build arguments, call the callback and check the arguments,
6245 put the result into newpos and return the replacement string, which
6246 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006247static PyObject *
6248unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006249 PyObject **errorHandler,
6250 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006251 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006252 Py_ssize_t startpos, Py_ssize_t endpos,
6253 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006254{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006255 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006256 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006257 PyObject *restuple;
6258 PyObject *resunicode;
6259
6260 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006262 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006264 }
6265
Benjamin Petersonbac79492012-01-14 13:34:47 -05006266 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006267 return NULL;
6268 len = PyUnicode_GET_LENGTH(unicode);
6269
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006270 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006271 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006274
6275 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006277 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006280 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 Py_DECREF(restuple);
6282 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006284 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 &resunicode, newpos)) {
6286 Py_DECREF(restuple);
6287 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006288 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006289 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6290 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6291 Py_DECREF(restuple);
6292 return NULL;
6293 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006294 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006295 *newpos = len + *newpos;
6296 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6298 Py_DECREF(restuple);
6299 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006300 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301 Py_INCREF(resunicode);
6302 Py_DECREF(restuple);
6303 return resunicode;
6304}
6305
Alexander Belopolsky40018472011-02-26 01:02:56 +00006306static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006307unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006308 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006309 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006310{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006311 /* input state */
6312 Py_ssize_t pos=0, size;
6313 int kind;
6314 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006315 /* output object */
6316 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006317 /* pointer into the output */
6318 char *str;
6319 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006320 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006321 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6322 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006323 PyObject *errorHandler = NULL;
6324 PyObject *exc = NULL;
6325 /* the following variable is used for caching string comparisons
6326 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6327 int known_errorHandler = -1;
6328
Benjamin Petersonbac79492012-01-14 13:34:47 -05006329 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006330 return NULL;
6331 size = PyUnicode_GET_LENGTH(unicode);
6332 kind = PyUnicode_KIND(unicode);
6333 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006334 /* allocate enough for a simple encoding without
6335 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006336 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006337 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006338 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006339 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006340 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006341 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342 ressize = size;
6343
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006344 while (pos < size) {
6345 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006346
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 /* can we encode this? */
6348 if (c<limit) {
6349 /* no overflow check, because we know that the space is enough */
6350 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006351 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006352 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 Py_ssize_t requiredsize;
6355 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006356 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006358 Py_ssize_t collstart = pos;
6359 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006361 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006362 ++collend;
6363 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6364 if (known_errorHandler==-1) {
6365 if ((errors==NULL) || (!strcmp(errors, "strict")))
6366 known_errorHandler = 1;
6367 else if (!strcmp(errors, "replace"))
6368 known_errorHandler = 2;
6369 else if (!strcmp(errors, "ignore"))
6370 known_errorHandler = 3;
6371 else if (!strcmp(errors, "xmlcharrefreplace"))
6372 known_errorHandler = 4;
6373 else
6374 known_errorHandler = 0;
6375 }
6376 switch (known_errorHandler) {
6377 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006378 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 goto onError;
6380 case 2: /* replace */
6381 while (collstart++<collend)
6382 *str++ = '?'; /* fall through */
6383 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006384 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 break;
6386 case 4: /* xmlcharrefreplace */
6387 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006388 /* determine replacement size */
6389 for (i = collstart, repsize = 0; i < collend; ++i) {
6390 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6391 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006393 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006395 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006397 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006399 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006401 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006403 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006404 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006406 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006408 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 if (requiredsize > ressize) {
6410 if (requiredsize<2*ressize)
6411 requiredsize = 2*ressize;
6412 if (_PyBytes_Resize(&res, requiredsize))
6413 goto onError;
6414 str = PyBytes_AS_STRING(res) + respos;
6415 ressize = requiredsize;
6416 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006417 /* generate replacement */
6418 for (i = collstart; i < collend; ++i) {
6419 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006421 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 break;
6423 default:
6424 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006425 encoding, reason, unicode, &exc,
6426 collstart, collend, &newpos);
6427 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006428 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006430 if (PyBytes_Check(repunicode)) {
6431 /* Directly copy bytes result to output. */
6432 repsize = PyBytes_Size(repunicode);
6433 if (repsize > 1) {
6434 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006435 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006436 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6437 Py_DECREF(repunicode);
6438 goto onError;
6439 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006440 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006441 ressize += repsize-1;
6442 }
6443 memcpy(str, PyBytes_AsString(repunicode), repsize);
6444 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006445 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006446 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006447 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006448 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 /* need more space? (at least enough for what we
6450 have+the replacement+the rest of the string, so
6451 we won't have to check space for encodable characters) */
6452 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006453 repsize = PyUnicode_GET_LENGTH(repunicode);
6454 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 if (requiredsize > ressize) {
6456 if (requiredsize<2*ressize)
6457 requiredsize = 2*ressize;
6458 if (_PyBytes_Resize(&res, requiredsize)) {
6459 Py_DECREF(repunicode);
6460 goto onError;
6461 }
6462 str = PyBytes_AS_STRING(res) + respos;
6463 ressize = requiredsize;
6464 }
6465 /* check if there is anything unencodable in the replacement
6466 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006467 for (i = 0; repsize-->0; ++i, ++str) {
6468 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006470 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006471 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 Py_DECREF(repunicode);
6473 goto onError;
6474 }
6475 *str = (char)c;
6476 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006477 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006478 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006479 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006480 }
6481 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006482 /* Resize if we allocated to much */
6483 size = str - PyBytes_AS_STRING(res);
6484 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006485 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006486 if (_PyBytes_Resize(&res, size) < 0)
6487 goto onError;
6488 }
6489
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006490 Py_XDECREF(errorHandler);
6491 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006492 return res;
6493
6494 onError:
6495 Py_XDECREF(res);
6496 Py_XDECREF(errorHandler);
6497 Py_XDECREF(exc);
6498 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006499}
6500
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006502PyObject *
6503PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006504 Py_ssize_t size,
6505 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006507 PyObject *result;
6508 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6509 if (unicode == NULL)
6510 return NULL;
6511 result = unicode_encode_ucs1(unicode, errors, 256);
6512 Py_DECREF(unicode);
6513 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514}
6515
Alexander Belopolsky40018472011-02-26 01:02:56 +00006516PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006517_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518{
6519 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 PyErr_BadArgument();
6521 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006523 if (PyUnicode_READY(unicode) == -1)
6524 return NULL;
6525 /* Fast path: if it is a one-byte string, construct
6526 bytes object directly. */
6527 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6528 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6529 PyUnicode_GET_LENGTH(unicode));
6530 /* Non-Latin-1 characters present. Defer to above function to
6531 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006532 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006533}
6534
6535PyObject*
6536PyUnicode_AsLatin1String(PyObject *unicode)
6537{
6538 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539}
6540
6541/* --- 7-bit ASCII Codec -------------------------------------------------- */
6542
Alexander Belopolsky40018472011-02-26 01:02:56 +00006543PyObject *
6544PyUnicode_DecodeASCII(const char *s,
6545 Py_ssize_t size,
6546 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006548 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006549 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006550 int kind;
6551 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006552 Py_ssize_t startinpos;
6553 Py_ssize_t endinpos;
6554 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006555 const char *e;
6556 PyObject *errorHandler = NULL;
6557 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006558
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006560 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006561
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006563 if (size == 1 && (unsigned char)s[0] < 128)
6564 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006565
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006566 unicode = PyUnicode_New(size, 127);
6567 if (unicode == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006570 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006571 data = PyUnicode_1BYTE_DATA(unicode);
6572 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6573 if (outpos == size)
6574 return unicode;
6575
6576 s += outpos;
6577 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006578 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 register unsigned char c = (unsigned char)*s;
6580 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006581 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 ++s;
6583 }
6584 else {
6585 startinpos = s-starts;
6586 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 if (unicode_decode_call_errorhandler(
6588 errors, &errorHandler,
6589 "ascii", "ordinal not in range(128)",
6590 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006591 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006593 kind = PyUnicode_KIND(unicode);
6594 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006597 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006598 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006599 Py_XDECREF(errorHandler);
6600 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006601 assert(_PyUnicode_CheckConsistency(unicode, 1));
6602 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006603
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006605 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006606 Py_XDECREF(errorHandler);
6607 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 return NULL;
6609}
6610
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006611/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006612PyObject *
6613PyUnicode_EncodeASCII(const Py_UNICODE *p,
6614 Py_ssize_t size,
6615 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006617 PyObject *result;
6618 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6619 if (unicode == NULL)
6620 return NULL;
6621 result = unicode_encode_ucs1(unicode, errors, 128);
6622 Py_DECREF(unicode);
6623 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624}
6625
Alexander Belopolsky40018472011-02-26 01:02:56 +00006626PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006627_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628{
6629 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 PyErr_BadArgument();
6631 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006633 if (PyUnicode_READY(unicode) == -1)
6634 return NULL;
6635 /* Fast path: if it is an ASCII-only string, construct bytes object
6636 directly. Else defer to above function to raise the exception. */
6637 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6638 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6639 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006640 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006641}
6642
6643PyObject *
6644PyUnicode_AsASCIIString(PyObject *unicode)
6645{
6646 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647}
6648
Victor Stinner99b95382011-07-04 14:23:54 +02006649#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006650
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006651/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006652
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006653#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006654#define NEED_RETRY
6655#endif
6656
Victor Stinner3a50e702011-10-18 21:21:00 +02006657#ifndef WC_ERR_INVALID_CHARS
6658# define WC_ERR_INVALID_CHARS 0x0080
6659#endif
6660
6661static char*
6662code_page_name(UINT code_page, PyObject **obj)
6663{
6664 *obj = NULL;
6665 if (code_page == CP_ACP)
6666 return "mbcs";
6667 if (code_page == CP_UTF7)
6668 return "CP_UTF7";
6669 if (code_page == CP_UTF8)
6670 return "CP_UTF8";
6671
6672 *obj = PyBytes_FromFormat("cp%u", code_page);
6673 if (*obj == NULL)
6674 return NULL;
6675 return PyBytes_AS_STRING(*obj);
6676}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006677
Alexander Belopolsky40018472011-02-26 01:02:56 +00006678static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006679is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006680{
6681 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006682 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006683
Victor Stinner3a50e702011-10-18 21:21:00 +02006684 if (!IsDBCSLeadByteEx(code_page, *curr))
6685 return 0;
6686
6687 prev = CharPrevExA(code_page, s, curr, 0);
6688 if (prev == curr)
6689 return 1;
6690 /* FIXME: This code is limited to "true" double-byte encodings,
6691 as it assumes an incomplete character consists of a single
6692 byte. */
6693 if (curr - prev == 2)
6694 return 1;
6695 if (!IsDBCSLeadByteEx(code_page, *prev))
6696 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006697 return 0;
6698}
6699
Victor Stinner3a50e702011-10-18 21:21:00 +02006700static DWORD
6701decode_code_page_flags(UINT code_page)
6702{
6703 if (code_page == CP_UTF7) {
6704 /* The CP_UTF7 decoder only supports flags=0 */
6705 return 0;
6706 }
6707 else
6708 return MB_ERR_INVALID_CHARS;
6709}
6710
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006711/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006712 * Decode a byte string from a Windows code page into unicode object in strict
6713 * mode.
6714 *
6715 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6716 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006717 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006718static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006719decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006720 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006721 const char *in,
6722 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006723{
Victor Stinner3a50e702011-10-18 21:21:00 +02006724 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006725 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006726 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006727
6728 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006729 assert(insize > 0);
6730 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6731 if (outsize <= 0)
6732 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006733
6734 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006736 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006737 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 if (*v == NULL)
6739 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006740 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006741 }
6742 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006744 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006745 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006747 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006748 }
6749
6750 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006751 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6752 if (outsize <= 0)
6753 goto error;
6754 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006755
Victor Stinner3a50e702011-10-18 21:21:00 +02006756error:
6757 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6758 return -2;
6759 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006760 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006761}
6762
Victor Stinner3a50e702011-10-18 21:21:00 +02006763/*
6764 * Decode a byte string from a code page into unicode object with an error
6765 * handler.
6766 *
6767 * Returns consumed size if succeed, or raise a WindowsError or
6768 * UnicodeDecodeError exception and returns -1 on error.
6769 */
6770static int
6771decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006772 PyObject **v,
6773 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006774 const char *errors)
6775{
6776 const char *startin = in;
6777 const char *endin = in + size;
6778 const DWORD flags = decode_code_page_flags(code_page);
6779 /* Ideally, we should get reason from FormatMessage. This is the Windows
6780 2000 English version of the message. */
6781 const char *reason = "No mapping for the Unicode character exists "
6782 "in the target code page.";
6783 /* each step cannot decode more than 1 character, but a character can be
6784 represented as a surrogate pair */
6785 wchar_t buffer[2], *startout, *out;
6786 int insize, outsize;
6787 PyObject *errorHandler = NULL;
6788 PyObject *exc = NULL;
6789 PyObject *encoding_obj = NULL;
6790 char *encoding;
6791 DWORD err;
6792 int ret = -1;
6793
6794 assert(size > 0);
6795
6796 encoding = code_page_name(code_page, &encoding_obj);
6797 if (encoding == NULL)
6798 return -1;
6799
6800 if (errors == NULL || strcmp(errors, "strict") == 0) {
6801 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6802 UnicodeDecodeError. */
6803 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6804 if (exc != NULL) {
6805 PyCodec_StrictErrors(exc);
6806 Py_CLEAR(exc);
6807 }
6808 goto error;
6809 }
6810
6811 if (*v == NULL) {
6812 /* Create unicode object */
6813 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6814 PyErr_NoMemory();
6815 goto error;
6816 }
Victor Stinnerab595942011-12-17 04:59:06 +01006817 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006818 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006819 if (*v == NULL)
6820 goto error;
6821 startout = PyUnicode_AS_UNICODE(*v);
6822 }
6823 else {
6824 /* Extend unicode object */
6825 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6826 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6827 PyErr_NoMemory();
6828 goto error;
6829 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006830 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006831 goto error;
6832 startout = PyUnicode_AS_UNICODE(*v) + n;
6833 }
6834
6835 /* Decode the byte string character per character */
6836 out = startout;
6837 while (in < endin)
6838 {
6839 /* Decode a character */
6840 insize = 1;
6841 do
6842 {
6843 outsize = MultiByteToWideChar(code_page, flags,
6844 in, insize,
6845 buffer, Py_ARRAY_LENGTH(buffer));
6846 if (outsize > 0)
6847 break;
6848 err = GetLastError();
6849 if (err != ERROR_NO_UNICODE_TRANSLATION
6850 && err != ERROR_INSUFFICIENT_BUFFER)
6851 {
6852 PyErr_SetFromWindowsErr(0);
6853 goto error;
6854 }
6855 insize++;
6856 }
6857 /* 4=maximum length of a UTF-8 sequence */
6858 while (insize <= 4 && (in + insize) <= endin);
6859
6860 if (outsize <= 0) {
6861 Py_ssize_t startinpos, endinpos, outpos;
6862
6863 startinpos = in - startin;
6864 endinpos = startinpos + 1;
6865 outpos = out - PyUnicode_AS_UNICODE(*v);
6866 if (unicode_decode_call_errorhandler(
6867 errors, &errorHandler,
6868 encoding, reason,
6869 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006870 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 {
6872 goto error;
6873 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006874 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006875 }
6876 else {
6877 in += insize;
6878 memcpy(out, buffer, outsize * sizeof(wchar_t));
6879 out += outsize;
6880 }
6881 }
6882
6883 /* write a NUL character at the end */
6884 *out = 0;
6885
6886 /* Extend unicode object */
6887 outsize = out - startout;
6888 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006889 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006890 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006891 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006892
6893error:
6894 Py_XDECREF(encoding_obj);
6895 Py_XDECREF(errorHandler);
6896 Py_XDECREF(exc);
6897 return ret;
6898}
6899
Victor Stinner3a50e702011-10-18 21:21:00 +02006900static PyObject *
6901decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006902 const char *s, Py_ssize_t size,
6903 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006904{
Victor Stinner76a31a62011-11-04 00:05:13 +01006905 PyObject *v = NULL;
6906 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006907
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 if (code_page < 0) {
6909 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6910 return NULL;
6911 }
6912
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006913 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915
Victor Stinner76a31a62011-11-04 00:05:13 +01006916 do
6917 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006918#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006919 if (size > INT_MAX) {
6920 chunk_size = INT_MAX;
6921 final = 0;
6922 done = 0;
6923 }
6924 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006925#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006926 {
6927 chunk_size = (int)size;
6928 final = (consumed == NULL);
6929 done = 1;
6930 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006931
Victor Stinner76a31a62011-11-04 00:05:13 +01006932 /* Skip trailing lead-byte unless 'final' is set */
6933 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6934 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006935
Victor Stinner76a31a62011-11-04 00:05:13 +01006936 if (chunk_size == 0 && done) {
6937 if (v != NULL)
6938 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006939 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006940 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006941
Victor Stinner76a31a62011-11-04 00:05:13 +01006942
6943 converted = decode_code_page_strict(code_page, &v,
6944 s, chunk_size);
6945 if (converted == -2)
6946 converted = decode_code_page_errors(code_page, &v,
6947 s, chunk_size,
6948 errors);
6949 assert(converted != 0);
6950
6951 if (converted < 0) {
6952 Py_XDECREF(v);
6953 return NULL;
6954 }
6955
6956 if (consumed)
6957 *consumed += converted;
6958
6959 s += converted;
6960 size -= converted;
6961 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006962
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006963 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006964}
6965
Alexander Belopolsky40018472011-02-26 01:02:56 +00006966PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006967PyUnicode_DecodeCodePageStateful(int code_page,
6968 const char *s,
6969 Py_ssize_t size,
6970 const char *errors,
6971 Py_ssize_t *consumed)
6972{
6973 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6974}
6975
6976PyObject *
6977PyUnicode_DecodeMBCSStateful(const char *s,
6978 Py_ssize_t size,
6979 const char *errors,
6980 Py_ssize_t *consumed)
6981{
6982 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6983}
6984
6985PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006986PyUnicode_DecodeMBCS(const char *s,
6987 Py_ssize_t size,
6988 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006989{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006990 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6991}
6992
Victor Stinner3a50e702011-10-18 21:21:00 +02006993static DWORD
6994encode_code_page_flags(UINT code_page, const char *errors)
6995{
6996 if (code_page == CP_UTF8) {
6997 if (winver.dwMajorVersion >= 6)
6998 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6999 and later */
7000 return WC_ERR_INVALID_CHARS;
7001 else
7002 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7003 return 0;
7004 }
7005 else if (code_page == CP_UTF7) {
7006 /* CP_UTF7 only supports flags=0 */
7007 return 0;
7008 }
7009 else {
7010 if (errors != NULL && strcmp(errors, "replace") == 0)
7011 return 0;
7012 else
7013 return WC_NO_BEST_FIT_CHARS;
7014 }
7015}
7016
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007017/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007018 * Encode a Unicode string to a Windows code page into a byte string in strict
7019 * mode.
7020 *
7021 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7022 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007023 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007024static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007025encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007026 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007027 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007028{
Victor Stinner554f3f02010-06-16 23:33:54 +00007029 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007030 BOOL *pusedDefaultChar = &usedDefaultChar;
7031 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007032 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007033 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007034 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007035 const DWORD flags = encode_code_page_flags(code_page, NULL);
7036 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007037 /* Create a substring so that we can get the UTF-16 representation
7038 of just the slice under consideration. */
7039 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007040
Martin v. Löwis3d325192011-11-04 18:23:06 +01007041 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007042
Victor Stinner3a50e702011-10-18 21:21:00 +02007043 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007044 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007045 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007046 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007047
Victor Stinner2fc507f2011-11-04 20:06:39 +01007048 substring = PyUnicode_Substring(unicode, offset, offset+len);
7049 if (substring == NULL)
7050 return -1;
7051 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7052 if (p == NULL) {
7053 Py_DECREF(substring);
7054 return -1;
7055 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007056
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007057 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007058 outsize = WideCharToMultiByte(code_page, flags,
7059 p, size,
7060 NULL, 0,
7061 NULL, pusedDefaultChar);
7062 if (outsize <= 0)
7063 goto error;
7064 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007065 if (pusedDefaultChar && *pusedDefaultChar) {
7066 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007067 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007068 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007069
Victor Stinner3a50e702011-10-18 21:21:00 +02007070 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007071 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007072 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007073 if (*outbytes == NULL) {
7074 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007076 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007077 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007078 }
7079 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007081 const Py_ssize_t n = PyBytes_Size(*outbytes);
7082 if (outsize > PY_SSIZE_T_MAX - n) {
7083 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007084 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007086 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007087 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7088 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007089 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007090 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007091 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007092 }
7093
7094 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007095 outsize = WideCharToMultiByte(code_page, flags,
7096 p, size,
7097 out, outsize,
7098 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007099 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007100 if (outsize <= 0)
7101 goto error;
7102 if (pusedDefaultChar && *pusedDefaultChar)
7103 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007105
Victor Stinner3a50e702011-10-18 21:21:00 +02007106error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007107 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007108 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7109 return -2;
7110 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007111 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007112}
7113
Victor Stinner3a50e702011-10-18 21:21:00 +02007114/*
7115 * Encode a Unicode string to a Windows code page into a byte string using a
7116 * error handler.
7117 *
7118 * Returns consumed characters if succeed, or raise a WindowsError and returns
7119 * -1 on other error.
7120 */
7121static int
7122encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007123 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007124 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007125{
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007127 Py_ssize_t pos = unicode_offset;
7128 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 /* Ideally, we should get reason from FormatMessage. This is the Windows
7130 2000 English version of the message. */
7131 const char *reason = "invalid character";
7132 /* 4=maximum length of a UTF-8 sequence */
7133 char buffer[4];
7134 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7135 Py_ssize_t outsize;
7136 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 PyObject *errorHandler = NULL;
7138 PyObject *exc = NULL;
7139 PyObject *encoding_obj = NULL;
7140 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007141 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 PyObject *rep;
7143 int ret = -1;
7144
7145 assert(insize > 0);
7146
7147 encoding = code_page_name(code_page, &encoding_obj);
7148 if (encoding == NULL)
7149 return -1;
7150
7151 if (errors == NULL || strcmp(errors, "strict") == 0) {
7152 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7153 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007154 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 if (exc != NULL) {
7156 PyCodec_StrictErrors(exc);
7157 Py_DECREF(exc);
7158 }
7159 Py_XDECREF(encoding_obj);
7160 return -1;
7161 }
7162
7163 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7164 pusedDefaultChar = &usedDefaultChar;
7165 else
7166 pusedDefaultChar = NULL;
7167
7168 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7169 PyErr_NoMemory();
7170 goto error;
7171 }
7172 outsize = insize * Py_ARRAY_LENGTH(buffer);
7173
7174 if (*outbytes == NULL) {
7175 /* Create string object */
7176 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7177 if (*outbytes == NULL)
7178 goto error;
7179 out = PyBytes_AS_STRING(*outbytes);
7180 }
7181 else {
7182 /* Extend string object */
7183 Py_ssize_t n = PyBytes_Size(*outbytes);
7184 if (n > PY_SSIZE_T_MAX - outsize) {
7185 PyErr_NoMemory();
7186 goto error;
7187 }
7188 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7189 goto error;
7190 out = PyBytes_AS_STRING(*outbytes) + n;
7191 }
7192
7193 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007194 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007195 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007196 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7197 wchar_t chars[2];
7198 int charsize;
7199 if (ch < 0x10000) {
7200 chars[0] = (wchar_t)ch;
7201 charsize = 1;
7202 }
7203 else {
7204 ch -= 0x10000;
7205 chars[0] = 0xd800 + (ch >> 10);
7206 chars[1] = 0xdc00 + (ch & 0x3ff);
7207 charsize = 2;
7208 }
7209
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007211 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 buffer, Py_ARRAY_LENGTH(buffer),
7213 NULL, pusedDefaultChar);
7214 if (outsize > 0) {
7215 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7216 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007217 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 memcpy(out, buffer, outsize);
7219 out += outsize;
7220 continue;
7221 }
7222 }
7223 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7224 PyErr_SetFromWindowsErr(0);
7225 goto error;
7226 }
7227
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 rep = unicode_encode_call_errorhandler(
7229 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007230 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007231 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 if (rep == NULL)
7233 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007234 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007235
7236 if (PyBytes_Check(rep)) {
7237 outsize = PyBytes_GET_SIZE(rep);
7238 if (outsize != 1) {
7239 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7240 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7241 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7242 Py_DECREF(rep);
7243 goto error;
7244 }
7245 out = PyBytes_AS_STRING(*outbytes) + offset;
7246 }
7247 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7248 out += outsize;
7249 }
7250 else {
7251 Py_ssize_t i;
7252 enum PyUnicode_Kind kind;
7253 void *data;
7254
Benjamin Petersonbac79492012-01-14 13:34:47 -05007255 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007256 Py_DECREF(rep);
7257 goto error;
7258 }
7259
7260 outsize = PyUnicode_GET_LENGTH(rep);
7261 if (outsize != 1) {
7262 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7263 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7264 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7265 Py_DECREF(rep);
7266 goto error;
7267 }
7268 out = PyBytes_AS_STRING(*outbytes) + offset;
7269 }
7270 kind = PyUnicode_KIND(rep);
7271 data = PyUnicode_DATA(rep);
7272 for (i=0; i < outsize; i++) {
7273 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7274 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007275 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007276 encoding, unicode,
7277 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 "unable to encode error handler result to ASCII");
7279 Py_DECREF(rep);
7280 goto error;
7281 }
7282 *out = (unsigned char)ch;
7283 out++;
7284 }
7285 }
7286 Py_DECREF(rep);
7287 }
7288 /* write a NUL byte */
7289 *out = 0;
7290 outsize = out - PyBytes_AS_STRING(*outbytes);
7291 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7292 if (_PyBytes_Resize(outbytes, outsize) < 0)
7293 goto error;
7294 ret = 0;
7295
7296error:
7297 Py_XDECREF(encoding_obj);
7298 Py_XDECREF(errorHandler);
7299 Py_XDECREF(exc);
7300 return ret;
7301}
7302
Victor Stinner3a50e702011-10-18 21:21:00 +02007303static PyObject *
7304encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007305 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007306 const char *errors)
7307{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007308 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007310 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007311 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007312
Benjamin Petersonbac79492012-01-14 13:34:47 -05007313 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007314 return NULL;
7315 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007316
Victor Stinner3a50e702011-10-18 21:21:00 +02007317 if (code_page < 0) {
7318 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7319 return NULL;
7320 }
7321
Martin v. Löwis3d325192011-11-04 18:23:06 +01007322 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007323 return PyBytes_FromStringAndSize(NULL, 0);
7324
Victor Stinner7581cef2011-11-03 22:32:33 +01007325 offset = 0;
7326 do
7327 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007328#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007329 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007330 chunks. */
7331 if (len > INT_MAX/2) {
7332 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007333 done = 0;
7334 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007335 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007336#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007337 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007338 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007339 done = 1;
7340 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007341
Victor Stinner76a31a62011-11-04 00:05:13 +01007342 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007343 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007344 errors);
7345 if (ret == -2)
7346 ret = encode_code_page_errors(code_page, &outbytes,
7347 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007348 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007349 if (ret < 0) {
7350 Py_XDECREF(outbytes);
7351 return NULL;
7352 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007353
Victor Stinner7581cef2011-11-03 22:32:33 +01007354 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007355 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007356 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007357
Victor Stinner3a50e702011-10-18 21:21:00 +02007358 return outbytes;
7359}
7360
7361PyObject *
7362PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7363 Py_ssize_t size,
7364 const char *errors)
7365{
Victor Stinner7581cef2011-11-03 22:32:33 +01007366 PyObject *unicode, *res;
7367 unicode = PyUnicode_FromUnicode(p, size);
7368 if (unicode == NULL)
7369 return NULL;
7370 res = encode_code_page(CP_ACP, unicode, errors);
7371 Py_DECREF(unicode);
7372 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007373}
7374
7375PyObject *
7376PyUnicode_EncodeCodePage(int code_page,
7377 PyObject *unicode,
7378 const char *errors)
7379{
Victor Stinner7581cef2011-11-03 22:32:33 +01007380 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007381}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007382
Alexander Belopolsky40018472011-02-26 01:02:56 +00007383PyObject *
7384PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007385{
7386 if (!PyUnicode_Check(unicode)) {
7387 PyErr_BadArgument();
7388 return NULL;
7389 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007390 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007391}
7392
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007393#undef NEED_RETRY
7394
Victor Stinner99b95382011-07-04 14:23:54 +02007395#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007396
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397/* --- Character Mapping Codec -------------------------------------------- */
7398
Alexander Belopolsky40018472011-02-26 01:02:56 +00007399PyObject *
7400PyUnicode_DecodeCharmap(const char *s,
7401 Py_ssize_t size,
7402 PyObject *mapping,
7403 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007405 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007406 Py_ssize_t startinpos;
7407 Py_ssize_t endinpos;
7408 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007409 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007410 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007411 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007412 PyObject *errorHandler = NULL;
7413 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007414
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415 /* Default to Latin-1 */
7416 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007419 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007423 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007424 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007425 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007426 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007427 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007428 enum PyUnicode_Kind mapkind;
7429 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007430 Py_UCS4 x;
7431
Benjamin Petersonbac79492012-01-14 13:34:47 -05007432 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007433 return NULL;
7434
7435 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007436 mapdata = PyUnicode_DATA(mapping);
7437 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007438 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007439 unsigned char ch;
7440 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7441 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7442 if (outkind == PyUnicode_1BYTE_KIND) {
7443 void *outdata = PyUnicode_DATA(v);
7444 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7445 while (s < e) {
7446 unsigned char ch = *s;
7447 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7448 if (x > maxchar)
7449 goto Error;
7450 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7451 ++s;
7452 }
7453 break;
7454 }
7455 else if (outkind == PyUnicode_2BYTE_KIND) {
7456 void *outdata = PyUnicode_DATA(v);
7457 while (s < e) {
7458 unsigned char ch = *s;
7459 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7460 if (x == 0xFFFE)
7461 goto Error;
7462 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7463 ++s;
7464 }
7465 break;
7466 }
7467 }
7468 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007471 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007472 else
7473 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007474Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007475 if (x == 0xfffe)
7476 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 startinpos = s-starts;
7479 endinpos = startinpos+1;
7480 if (unicode_decode_call_errorhandler(
7481 errors, &errorHandler,
7482 "charmap", "character maps to <undefined>",
7483 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007484 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 goto onError;
7486 }
7487 continue;
7488 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007489
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007490 if (unicode_putchar(&v, &outpos, x) < 0)
7491 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007492 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007493 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007494 }
7495 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 while (s < e) {
7497 unsigned char ch = *s;
7498 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007499
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7501 w = PyLong_FromLong((long)ch);
7502 if (w == NULL)
7503 goto onError;
7504 x = PyObject_GetItem(mapping, w);
7505 Py_DECREF(w);
7506 if (x == NULL) {
7507 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7508 /* No mapping found means: mapping is undefined. */
7509 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007510 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007511 } else
7512 goto onError;
7513 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007514
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007516 if (x == Py_None)
7517 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 if (PyLong_Check(x)) {
7519 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007520 if (value == 0xFFFE)
7521 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007522 if (value < 0 || value > MAX_UNICODE) {
7523 PyErr_Format(PyExc_TypeError,
7524 "character mapping must be in range(0x%lx)",
7525 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 Py_DECREF(x);
7527 goto onError;
7528 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007529 if (unicode_putchar(&v, &outpos, value) < 0) {
7530 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007531 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007532 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007533 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007535 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007536
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007537 if (PyUnicode_READY(x) == -1) {
7538 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007539 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007540 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007541 targetsize = PyUnicode_GET_LENGTH(x);
7542
7543 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 /* 1-1 mapping */
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007545 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007546 if (value == 0xFFFE)
7547 goto Undefined;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007548 if (unicode_putchar(&v, &outpos, value) < 0) {
7549 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007550 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007551 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007552 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 else if (targetsize > 1) {
7554 /* 1-n mapping */
7555 if (targetsize > extrachars) {
7556 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007557 Py_ssize_t needed = (targetsize - extrachars) + \
7558 (targetsize << 2);
7559 extrachars += needed;
7560 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007561 if (unicode_resize(&v,
7562 PyUnicode_GET_LENGTH(v) + needed) < 0)
7563 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 Py_DECREF(x);
7565 goto onError;
7566 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007567 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007568 if (unicode_widen(&v, outpos,
7569 PyUnicode_MAX_CHAR_VALUE(x)) < 0) {
7570 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007571 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007572 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007573 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7574 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007575 extrachars -= targetsize;
7576 }
7577 /* 1-0 mapping: skip the character */
7578 }
7579 else {
7580 /* wrong return value */
7581 PyErr_SetString(PyExc_TypeError,
7582 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007583 Py_DECREF(x);
7584 goto onError;
7585 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 Py_DECREF(x);
7587 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007588 continue;
7589Undefined:
7590 /* undefined mapping */
7591 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007592 startinpos = s-starts;
7593 endinpos = startinpos+1;
7594 if (unicode_decode_call_errorhandler(
7595 errors, &errorHandler,
7596 "charmap", "character maps to <undefined>",
7597 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007598 &v, &outpos)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007599 goto onError;
7600 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007603 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007604 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007605 Py_XDECREF(errorHandler);
7606 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007607 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007608
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007610 Py_XDECREF(errorHandler);
7611 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 Py_XDECREF(v);
7613 return NULL;
7614}
7615
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007616/* Charmap encoding: the lookup table */
7617
Alexander Belopolsky40018472011-02-26 01:02:56 +00007618struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 PyObject_HEAD
7620 unsigned char level1[32];
7621 int count2, count3;
7622 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007623};
7624
7625static PyObject*
7626encoding_map_size(PyObject *obj, PyObject* args)
7627{
7628 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007629 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007631}
7632
7633static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007634 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 PyDoc_STR("Return the size (in bytes) of this object") },
7636 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007637};
7638
7639static void
7640encoding_map_dealloc(PyObject* o)
7641{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007642 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007643}
7644
7645static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007646 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 "EncodingMap", /*tp_name*/
7648 sizeof(struct encoding_map), /*tp_basicsize*/
7649 0, /*tp_itemsize*/
7650 /* methods */
7651 encoding_map_dealloc, /*tp_dealloc*/
7652 0, /*tp_print*/
7653 0, /*tp_getattr*/
7654 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007655 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 0, /*tp_repr*/
7657 0, /*tp_as_number*/
7658 0, /*tp_as_sequence*/
7659 0, /*tp_as_mapping*/
7660 0, /*tp_hash*/
7661 0, /*tp_call*/
7662 0, /*tp_str*/
7663 0, /*tp_getattro*/
7664 0, /*tp_setattro*/
7665 0, /*tp_as_buffer*/
7666 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7667 0, /*tp_doc*/
7668 0, /*tp_traverse*/
7669 0, /*tp_clear*/
7670 0, /*tp_richcompare*/
7671 0, /*tp_weaklistoffset*/
7672 0, /*tp_iter*/
7673 0, /*tp_iternext*/
7674 encoding_map_methods, /*tp_methods*/
7675 0, /*tp_members*/
7676 0, /*tp_getset*/
7677 0, /*tp_base*/
7678 0, /*tp_dict*/
7679 0, /*tp_descr_get*/
7680 0, /*tp_descr_set*/
7681 0, /*tp_dictoffset*/
7682 0, /*tp_init*/
7683 0, /*tp_alloc*/
7684 0, /*tp_new*/
7685 0, /*tp_free*/
7686 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007687};
7688
7689PyObject*
7690PyUnicode_BuildEncodingMap(PyObject* string)
7691{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007692 PyObject *result;
7693 struct encoding_map *mresult;
7694 int i;
7695 int need_dict = 0;
7696 unsigned char level1[32];
7697 unsigned char level2[512];
7698 unsigned char *mlevel1, *mlevel2, *mlevel3;
7699 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007700 int kind;
7701 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007702 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007703 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007704
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007705 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007706 PyErr_BadArgument();
7707 return NULL;
7708 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007709 kind = PyUnicode_KIND(string);
7710 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007711 length = PyUnicode_GET_LENGTH(string);
7712 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007713 memset(level1, 0xFF, sizeof level1);
7714 memset(level2, 0xFF, sizeof level2);
7715
7716 /* If there isn't a one-to-one mapping of NULL to \0,
7717 or if there are non-BMP characters, we need to use
7718 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007719 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007720 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007721 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007722 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007723 ch = PyUnicode_READ(kind, data, i);
7724 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007725 need_dict = 1;
7726 break;
7727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007728 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007729 /* unmapped character */
7730 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007731 l1 = ch >> 11;
7732 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007733 if (level1[l1] == 0xFF)
7734 level1[l1] = count2++;
7735 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007736 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007737 }
7738
7739 if (count2 >= 0xFF || count3 >= 0xFF)
7740 need_dict = 1;
7741
7742 if (need_dict) {
7743 PyObject *result = PyDict_New();
7744 PyObject *key, *value;
7745 if (!result)
7746 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007747 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007748 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007749 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007750 if (!key || !value)
7751 goto failed1;
7752 if (PyDict_SetItem(result, key, value) == -1)
7753 goto failed1;
7754 Py_DECREF(key);
7755 Py_DECREF(value);
7756 }
7757 return result;
7758 failed1:
7759 Py_XDECREF(key);
7760 Py_XDECREF(value);
7761 Py_DECREF(result);
7762 return NULL;
7763 }
7764
7765 /* Create a three-level trie */
7766 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7767 16*count2 + 128*count3 - 1);
7768 if (!result)
7769 return PyErr_NoMemory();
7770 PyObject_Init(result, &EncodingMapType);
7771 mresult = (struct encoding_map*)result;
7772 mresult->count2 = count2;
7773 mresult->count3 = count3;
7774 mlevel1 = mresult->level1;
7775 mlevel2 = mresult->level23;
7776 mlevel3 = mresult->level23 + 16*count2;
7777 memcpy(mlevel1, level1, 32);
7778 memset(mlevel2, 0xFF, 16*count2);
7779 memset(mlevel3, 0, 128*count3);
7780 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007781 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007782 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007783 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7784 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007785 /* unmapped character */
7786 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007787 o1 = ch>>11;
7788 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007789 i2 = 16*mlevel1[o1] + o2;
7790 if (mlevel2[i2] == 0xFF)
7791 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007792 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007793 i3 = 128*mlevel2[i2] + o3;
7794 mlevel3[i3] = i;
7795 }
7796 return result;
7797}
7798
7799static int
Victor Stinner22168992011-11-20 17:09:18 +01007800encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007801{
7802 struct encoding_map *map = (struct encoding_map*)mapping;
7803 int l1 = c>>11;
7804 int l2 = (c>>7) & 0xF;
7805 int l3 = c & 0x7F;
7806 int i;
7807
Victor Stinner22168992011-11-20 17:09:18 +01007808 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007810 if (c == 0)
7811 return 0;
7812 /* level 1*/
7813 i = map->level1[l1];
7814 if (i == 0xFF) {
7815 return -1;
7816 }
7817 /* level 2*/
7818 i = map->level23[16*i+l2];
7819 if (i == 0xFF) {
7820 return -1;
7821 }
7822 /* level 3 */
7823 i = map->level23[16*map->count2 + 128*i + l3];
7824 if (i == 0) {
7825 return -1;
7826 }
7827 return i;
7828}
7829
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007830/* Lookup the character ch in the mapping. If the character
7831 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007832 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007833static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007834charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835{
Christian Heimes217cfd12007-12-02 14:31:20 +00007836 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007837 PyObject *x;
7838
7839 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007841 x = PyObject_GetItem(mapping, w);
7842 Py_DECREF(w);
7843 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7845 /* No mapping found means: mapping is undefined. */
7846 PyErr_Clear();
7847 x = Py_None;
7848 Py_INCREF(x);
7849 return x;
7850 } else
7851 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007853 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007855 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 long value = PyLong_AS_LONG(x);
7857 if (value < 0 || value > 255) {
7858 PyErr_SetString(PyExc_TypeError,
7859 "character mapping must be in range(256)");
7860 Py_DECREF(x);
7861 return NULL;
7862 }
7863 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007865 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 /* wrong return value */
7869 PyErr_Format(PyExc_TypeError,
7870 "character mapping must return integer, bytes or None, not %.400s",
7871 x->ob_type->tp_name);
7872 Py_DECREF(x);
7873 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 }
7875}
7876
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007877static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007878charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007879{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007880 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7881 /* exponentially overallocate to minimize reallocations */
7882 if (requiredsize < 2*outsize)
7883 requiredsize = 2*outsize;
7884 if (_PyBytes_Resize(outobj, requiredsize))
7885 return -1;
7886 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887}
7888
/* Outcome of trying to encode one character through a charmap
   (see charmapencode_output). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007892/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007893 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007894 space is available. Return a new reference to the object that
7895 was put in the output buffer, or Py_None, if the mapping was undefined
7896 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007897 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007898static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007899charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007900 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007901{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007902 PyObject *rep;
7903 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007904 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007905
Christian Heimes90aa7642007-12-19 02:45:37 +00007906 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007907 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007909 if (res == -1)
7910 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 if (outsize<requiredsize)
7912 if (charmapencode_resize(outobj, outpos, requiredsize))
7913 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007914 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007915 outstart[(*outpos)++] = (char)res;
7916 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007917 }
7918
7919 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007920 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007922 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007923 Py_DECREF(rep);
7924 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007925 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 if (PyLong_Check(rep)) {
7927 Py_ssize_t requiredsize = *outpos+1;
7928 if (outsize<requiredsize)
7929 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7930 Py_DECREF(rep);
7931 return enc_EXCEPTION;
7932 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007933 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007935 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 else {
7937 const char *repchars = PyBytes_AS_STRING(rep);
7938 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7939 Py_ssize_t requiredsize = *outpos+repsize;
7940 if (outsize<requiredsize)
7941 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7942 Py_DECREF(rep);
7943 return enc_EXCEPTION;
7944 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007945 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 memcpy(outstart + *outpos, repchars, repsize);
7947 *outpos += repsize;
7948 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007949 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007950 Py_DECREF(rep);
7951 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007952}
7953
7954/* handle an error in PyUnicode_EncodeCharmap
7955 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007956static int
7957charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007958 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007959 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007960 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007961 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007962{
7963 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007964 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007965 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007966 enum PyUnicode_Kind kind;
7967 void *data;
7968 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007969 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007970 Py_ssize_t collstartpos = *inpos;
7971 Py_ssize_t collendpos = *inpos+1;
7972 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007973 char *encoding = "charmap";
7974 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007975 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007976 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007977 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007978
Benjamin Petersonbac79492012-01-14 13:34:47 -05007979 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007980 return -1;
7981 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007982 /* find all unencodable characters */
7983 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007984 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007985 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007986 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007987 val = encoding_map_lookup(ch, mapping);
7988 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 break;
7990 ++collendpos;
7991 continue;
7992 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007993
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007994 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7995 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 if (rep==NULL)
7997 return -1;
7998 else if (rep!=Py_None) {
7999 Py_DECREF(rep);
8000 break;
8001 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008002 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008004 }
8005 /* cache callback name lookup
8006 * (if not done yet, i.e. it's the first error) */
8007 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 if ((errors==NULL) || (!strcmp(errors, "strict")))
8009 *known_errorHandler = 1;
8010 else if (!strcmp(errors, "replace"))
8011 *known_errorHandler = 2;
8012 else if (!strcmp(errors, "ignore"))
8013 *known_errorHandler = 3;
8014 else if (!strcmp(errors, "xmlcharrefreplace"))
8015 *known_errorHandler = 4;
8016 else
8017 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008018 }
8019 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008020 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008021 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008022 return -1;
8023 case 2: /* replace */
8024 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 x = charmapencode_output('?', mapping, res, respos);
8026 if (x==enc_EXCEPTION) {
8027 return -1;
8028 }
8029 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008030 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 return -1;
8032 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008033 }
8034 /* fall through */
8035 case 3: /* ignore */
8036 *inpos = collendpos;
8037 break;
8038 case 4: /* xmlcharrefreplace */
8039 /* generate replacement (temporarily (mis)uses p) */
8040 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 char buffer[2+29+1+1];
8042 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008043 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 for (cp = buffer; *cp; ++cp) {
8045 x = charmapencode_output(*cp, mapping, res, respos);
8046 if (x==enc_EXCEPTION)
8047 return -1;
8048 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008049 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 return -1;
8051 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008052 }
8053 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008054 *inpos = collendpos;
8055 break;
8056 default:
8057 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008058 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008060 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008062 if (PyBytes_Check(repunicode)) {
8063 /* Directly copy bytes result to output. */
8064 Py_ssize_t outsize = PyBytes_Size(*res);
8065 Py_ssize_t requiredsize;
8066 repsize = PyBytes_Size(repunicode);
8067 requiredsize = *respos + repsize;
8068 if (requiredsize > outsize)
8069 /* Make room for all additional bytes. */
8070 if (charmapencode_resize(res, respos, requiredsize)) {
8071 Py_DECREF(repunicode);
8072 return -1;
8073 }
8074 memcpy(PyBytes_AsString(*res) + *respos,
8075 PyBytes_AsString(repunicode), repsize);
8076 *respos += repsize;
8077 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008078 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008079 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008080 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008081 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008082 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008083 Py_DECREF(repunicode);
8084 return -1;
8085 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008086 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008087 data = PyUnicode_DATA(repunicode);
8088 kind = PyUnicode_KIND(repunicode);
8089 for (index = 0; index < repsize; index++) {
8090 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8091 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008093 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 return -1;
8095 }
8096 else if (x==enc_FAILED) {
8097 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008098 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 return -1;
8100 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008101 }
8102 *inpos = newpos;
8103 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 }
8105 return 0;
8106}
8107
Alexander Belopolsky40018472011-02-26 01:02:56 +00008108PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008109_PyUnicode_EncodeCharmap(PyObject *unicode,
8110 PyObject *mapping,
8111 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008113 /* output object */
8114 PyObject *res = NULL;
8115 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008116 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008117 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008118 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008119 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008120 PyObject *errorHandler = NULL;
8121 PyObject *exc = NULL;
8122 /* the following variable is used for caching string comparisons
8123 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8124 * 3=ignore, 4=xmlcharrefreplace */
8125 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126
Benjamin Petersonbac79492012-01-14 13:34:47 -05008127 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008128 return NULL;
8129 size = PyUnicode_GET_LENGTH(unicode);
8130
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 /* Default to Latin-1 */
8132 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008133 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008135 /* allocate enough for a simple encoding without
8136 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008137 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008138 if (res == NULL)
8139 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008140 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008143 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008144 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008146 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 if (x==enc_EXCEPTION) /* error */
8148 goto onError;
8149 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008150 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 &exc,
8152 &known_errorHandler, &errorHandler, errors,
8153 &res, &respos)) {
8154 goto onError;
8155 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008156 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 else
8158 /* done with this character => adjust input position */
8159 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008163 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008164 if (_PyBytes_Resize(&res, respos) < 0)
8165 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008166
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008167 Py_XDECREF(exc);
8168 Py_XDECREF(errorHandler);
8169 return res;
8170
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008172 Py_XDECREF(res);
8173 Py_XDECREF(exc);
8174 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175 return NULL;
8176}
8177
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008178/* Deprecated */
8179PyObject *
8180PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8181 Py_ssize_t size,
8182 PyObject *mapping,
8183 const char *errors)
8184{
8185 PyObject *result;
8186 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8187 if (unicode == NULL)
8188 return NULL;
8189 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8190 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008191 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008192}
8193
Alexander Belopolsky40018472011-02-26 01:02:56 +00008194PyObject *
8195PyUnicode_AsCharmapString(PyObject *unicode,
8196 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197{
8198 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 PyErr_BadArgument();
8200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008202 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203}
8204
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008205/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008206static void
8207make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008208 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008209 Py_ssize_t startpos, Py_ssize_t endpos,
8210 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008212 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008213 *exceptionObject = _PyUnicodeTranslateError_Create(
8214 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215 }
8216 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8218 goto onError;
8219 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8220 goto onError;
8221 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8222 goto onError;
8223 return;
8224 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008225 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226 }
8227}
8228
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008230static void
8231raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008232 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008233 Py_ssize_t startpos, Py_ssize_t endpos,
8234 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008235{
8236 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008237 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240}
8241
8242/* error handling callback helper:
8243 build arguments, call the callback and check the arguments,
8244 put the result into newpos and return the replacement string, which
8245 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008246static PyObject *
8247unicode_translate_call_errorhandler(const char *errors,
8248 PyObject **errorHandler,
8249 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008250 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008251 Py_ssize_t startpos, Py_ssize_t endpos,
8252 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008254 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008255
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008256 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257 PyObject *restuple;
8258 PyObject *resunicode;
8259
8260 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 }
8265
8266 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008267 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270
8271 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008276 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 Py_DECREF(restuple);
8278 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008279 }
8280 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 &resunicode, &i_newpos)) {
8282 Py_DECREF(restuple);
8283 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008284 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008285 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008286 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008287 else
8288 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008289 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8291 Py_DECREF(restuple);
8292 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008293 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 Py_INCREF(resunicode);
8295 Py_DECREF(restuple);
8296 return resunicode;
8297}
8298
8299/* Lookup the character ch in the mapping and put the result in result,
8300 which must be decrefed by the caller.
8301 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008302static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008303charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304{
Christian Heimes217cfd12007-12-02 14:31:20 +00008305 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 PyObject *x;
8307
8308 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008310 x = PyObject_GetItem(mapping, w);
8311 Py_DECREF(w);
8312 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8314 /* No mapping found means: use 1:1 mapping. */
8315 PyErr_Clear();
8316 *result = NULL;
8317 return 0;
8318 } else
8319 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320 }
8321 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 *result = x;
8323 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008325 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 long value = PyLong_AS_LONG(x);
8327 long max = PyUnicode_GetMax();
8328 if (value < 0 || value > max) {
8329 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008330 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 Py_DECREF(x);
8332 return -1;
8333 }
8334 *result = x;
8335 return 0;
8336 }
8337 else if (PyUnicode_Check(x)) {
8338 *result = x;
8339 return 0;
8340 }
8341 else {
8342 /* wrong return value */
8343 PyErr_SetString(PyExc_TypeError,
8344 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008345 Py_DECREF(x);
8346 return -1;
8347 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348}
8349/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 if not reallocate and adjust various state variables.
8351 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008352static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008357 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008358 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 /* exponentially overallocate to minimize reallocations */
8360 if (requiredsize < 2 * oldsize)
8361 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008362 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8363 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008365 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367 }
8368 return 0;
8369}
8370/* lookup the character, put the result in the output string and adjust
8371 various state variables. Return a new reference to the object that
8372 was put in the output buffer in *result, or Py_None, if the mapping was
8373 undefined (in which case no character was written).
8374 The called must decref result.
8375 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008376static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8378 PyObject *mapping, Py_UCS4 **output,
8379 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008380 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8383 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388 }
8389 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008391 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 }
8395 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396 Py_ssize_t repsize;
8397 if (PyUnicode_READY(*res) == -1)
8398 return -1;
8399 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 if (repsize==1) {
8401 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 }
8404 else if (repsize!=0) {
8405 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008406 Py_ssize_t requiredsize = *opos +
8407 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008409 Py_ssize_t i;
8410 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008412 for(i = 0; i < repsize; i++)
8413 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 }
8416 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 return 0;
8419}
8420
Alexander Belopolsky40018472011-02-26 01:02:56 +00008421PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422_PyUnicode_TranslateCharmap(PyObject *input,
8423 PyObject *mapping,
8424 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008426 /* input object */
8427 char *idata;
8428 Py_ssize_t size, i;
8429 int kind;
8430 /* output buffer */
8431 Py_UCS4 *output = NULL;
8432 Py_ssize_t osize;
8433 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 char *reason = "character maps to <undefined>";
8437 PyObject *errorHandler = NULL;
8438 PyObject *exc = NULL;
8439 /* the following variable is used for caching string comparisons
8440 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8441 * 3=ignore, 4=xmlcharrefreplace */
8442 int known_errorHandler = -1;
8443
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 PyErr_BadArgument();
8446 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 if (PyUnicode_READY(input) == -1)
8450 return NULL;
8451 idata = (char*)PyUnicode_DATA(input);
8452 kind = PyUnicode_KIND(input);
8453 size = PyUnicode_GET_LENGTH(input);
8454 i = 0;
8455
8456 if (size == 0) {
8457 Py_INCREF(input);
8458 return input;
8459 }
8460
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008461 /* allocate enough for a simple 1:1 translation without
8462 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008463 osize = size;
8464 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8465 opos = 0;
8466 if (output == NULL) {
8467 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 /* try to encode it */
8473 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474 if (charmaptranslate_output(input, i, mapping,
8475 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 Py_XDECREF(x);
8477 goto onError;
8478 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008479 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 else { /* untranslatable character */
8483 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8484 Py_ssize_t repsize;
8485 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008488 Py_ssize_t collstart = i;
8489 Py_ssize_t collend = i+1;
8490 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008491
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 while (collend < size) {
8494 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 goto onError;
8496 Py_XDECREF(x);
8497 if (x!=Py_None)
8498 break;
8499 ++collend;
8500 }
8501 /* cache callback name lookup
8502 * (if not done yet, i.e. it's the first error) */
8503 if (known_errorHandler==-1) {
8504 if ((errors==NULL) || (!strcmp(errors, "strict")))
8505 known_errorHandler = 1;
8506 else if (!strcmp(errors, "replace"))
8507 known_errorHandler = 2;
8508 else if (!strcmp(errors, "ignore"))
8509 known_errorHandler = 3;
8510 else if (!strcmp(errors, "xmlcharrefreplace"))
8511 known_errorHandler = 4;
8512 else
8513 known_errorHandler = 0;
8514 }
8515 switch (known_errorHandler) {
8516 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 raise_translate_exception(&exc, input, collstart,
8518 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008519 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 case 2: /* replace */
8521 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 for (coll = collstart; coll<collend; coll++)
8523 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 /* fall through */
8525 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 break;
8528 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 /* generate replacement (temporarily (mis)uses i) */
8530 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 char buffer[2+29+1+1];
8532 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8534 if (charmaptranslate_makespace(&output, &osize,
8535 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 goto onError;
8537 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 break;
8542 default:
8543 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 reason, input, &exc,
8545 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008546 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008548 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008549 Py_DECREF(repunicode);
8550 goto onError;
8551 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553 repsize = PyUnicode_GET_LENGTH(repunicode);
8554 if (charmaptranslate_makespace(&output, &osize,
8555 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 Py_DECREF(repunicode);
8557 goto onError;
8558 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 for (uni2 = 0; repsize-->0; ++uni2)
8560 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8561 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008563 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008564 }
8565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8567 if (!res)
8568 goto onError;
8569 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570 Py_XDECREF(exc);
8571 Py_XDECREF(errorHandler);
8572 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008576 Py_XDECREF(exc);
8577 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 return NULL;
8579}
8580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581/* Deprecated. Use PyUnicode_Translate instead. */
8582PyObject *
8583PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8584 Py_ssize_t size,
8585 PyObject *mapping,
8586 const char *errors)
8587{
Christian Heimes5f520f42012-09-11 14:03:25 +02008588 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8590 if (!unicode)
8591 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008592 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8593 Py_DECREF(unicode);
8594 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595}
8596
Alexander Belopolsky40018472011-02-26 01:02:56 +00008597PyObject *
8598PyUnicode_Translate(PyObject *str,
8599 PyObject *mapping,
8600 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601{
8602 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008603
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604 str = PyUnicode_FromObject(str);
8605 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008606 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 Py_DECREF(str);
8609 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610}
Tim Petersced69f82003-09-16 20:30:58 +00008611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008613fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614{
8615 /* No need to call PyUnicode_READY(self) because this function is only
8616 called as a callback from fixup() which does it already. */
8617 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8618 const int kind = PyUnicode_KIND(self);
8619 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008620 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008621 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 Py_ssize_t i;
8623
8624 for (i = 0; i < len; ++i) {
8625 ch = PyUnicode_READ(kind, data, i);
8626 fixed = 0;
8627 if (ch > 127) {
8628 if (Py_UNICODE_ISSPACE(ch))
8629 fixed = ' ';
8630 else {
8631 const int decimal = Py_UNICODE_TODECIMAL(ch);
8632 if (decimal >= 0)
8633 fixed = '0' + decimal;
8634 }
8635 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008636 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008637 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 PyUnicode_WRITE(kind, data, i, fixed);
8639 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008640 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008641 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 }
8644
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008645 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646}
8647
8648PyObject *
8649_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8650{
8651 if (!PyUnicode_Check(unicode)) {
8652 PyErr_BadInternalCall();
8653 return NULL;
8654 }
8655 if (PyUnicode_READY(unicode) == -1)
8656 return NULL;
8657 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8658 /* If the string is already ASCII, just return the same string */
8659 Py_INCREF(unicode);
8660 return unicode;
8661 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008662 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663}
8664
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008665PyObject *
8666PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8667 Py_ssize_t length)
8668{
Victor Stinnerf0124502011-11-21 23:12:56 +01008669 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008670 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008671 Py_UCS4 maxchar;
8672 enum PyUnicode_Kind kind;
8673 void *data;
8674
Victor Stinner99d7ad02012-02-22 13:37:39 +01008675 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008676 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008677 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008678 if (ch > 127) {
8679 int decimal = Py_UNICODE_TODECIMAL(ch);
8680 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008681 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008682 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008683 }
8684 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008685
8686 /* Copy to a new string */
8687 decimal = PyUnicode_New(length, maxchar);
8688 if (decimal == NULL)
8689 return decimal;
8690 kind = PyUnicode_KIND(decimal);
8691 data = PyUnicode_DATA(decimal);
8692 /* Iterate over code points */
8693 for (i = 0; i < length; i++) {
8694 Py_UNICODE ch = s[i];
8695 if (ch > 127) {
8696 int decimal = Py_UNICODE_TODECIMAL(ch);
8697 if (decimal >= 0)
8698 ch = '0' + decimal;
8699 }
8700 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008702 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008703}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008704/* --- Decimal Encoder ---------------------------------------------------- */
8705
Alexander Belopolsky40018472011-02-26 01:02:56 +00008706int
8707PyUnicode_EncodeDecimal(Py_UNICODE *s,
8708 Py_ssize_t length,
8709 char *output,
8710 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008711{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008712 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008713 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008714 enum PyUnicode_Kind kind;
8715 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008716
8717 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 PyErr_BadArgument();
8719 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008720 }
8721
Victor Stinner42bf7752011-11-21 22:52:58 +01008722 unicode = PyUnicode_FromUnicode(s, length);
8723 if (unicode == NULL)
8724 return -1;
8725
Benjamin Petersonbac79492012-01-14 13:34:47 -05008726 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008727 Py_DECREF(unicode);
8728 return -1;
8729 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008730 kind = PyUnicode_KIND(unicode);
8731 data = PyUnicode_DATA(unicode);
8732
Victor Stinnerb84d7232011-11-22 01:50:07 +01008733 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008734 PyObject *exc;
8735 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008737 Py_ssize_t startpos;
8738
8739 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008740
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008742 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008743 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008745 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 decimal = Py_UNICODE_TODECIMAL(ch);
8747 if (decimal >= 0) {
8748 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008749 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 continue;
8751 }
8752 if (0 < ch && ch < 256) {
8753 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008754 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 continue;
8756 }
Victor Stinner6345be92011-11-25 20:09:01 +01008757
Victor Stinner42bf7752011-11-21 22:52:58 +01008758 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008759 exc = NULL;
8760 raise_encode_exception(&exc, "decimal", unicode,
8761 startpos, startpos+1,
8762 "invalid decimal Unicode string");
8763 Py_XDECREF(exc);
8764 Py_DECREF(unicode);
8765 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008766 }
8767 /* 0-terminate the output string */
8768 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008769 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008770 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008771}
8772
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773/* --- Helpers ------------------------------------------------------------ */
8774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008776any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 Py_ssize_t start,
8778 Py_ssize_t end)
8779{
8780 int kind1, kind2, kind;
8781 void *buf1, *buf2;
8782 Py_ssize_t len1, len2, result;
8783
8784 kind1 = PyUnicode_KIND(s1);
8785 kind2 = PyUnicode_KIND(s2);
8786 kind = kind1 > kind2 ? kind1 : kind2;
8787 buf1 = PyUnicode_DATA(s1);
8788 buf2 = PyUnicode_DATA(s2);
8789 if (kind1 != kind)
8790 buf1 = _PyUnicode_AsKind(s1, kind);
8791 if (!buf1)
8792 return -2;
8793 if (kind2 != kind)
8794 buf2 = _PyUnicode_AsKind(s2, kind);
8795 if (!buf2) {
8796 if (kind1 != kind) PyMem_Free(buf1);
8797 return -2;
8798 }
8799 len1 = PyUnicode_GET_LENGTH(s1);
8800 len2 = PyUnicode_GET_LENGTH(s2);
8801
Victor Stinner794d5672011-10-10 03:21:36 +02008802 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008803 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008804 case PyUnicode_1BYTE_KIND:
8805 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8806 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8807 else
8808 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8809 break;
8810 case PyUnicode_2BYTE_KIND:
8811 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8812 break;
8813 case PyUnicode_4BYTE_KIND:
8814 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8815 break;
8816 default:
8817 assert(0); result = -2;
8818 }
8819 }
8820 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008821 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008822 case PyUnicode_1BYTE_KIND:
8823 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8824 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8825 else
8826 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8827 break;
8828 case PyUnicode_2BYTE_KIND:
8829 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8830 break;
8831 case PyUnicode_4BYTE_KIND:
8832 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8833 break;
8834 default:
8835 assert(0); result = -2;
8836 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 }
8838
8839 if (kind1 != kind)
8840 PyMem_Free(buf1);
8841 if (kind2 != kind)
8842 PyMem_Free(buf2);
8843
8844 return result;
8845}
8846
8847Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008848_PyUnicode_InsertThousandsGrouping(
8849 PyObject *unicode, Py_ssize_t index,
8850 Py_ssize_t n_buffer,
8851 void *digits, Py_ssize_t n_digits,
8852 Py_ssize_t min_width,
8853 const char *grouping, PyObject *thousands_sep,
8854 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855{
Victor Stinner41a863c2012-02-24 00:37:51 +01008856 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008857 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008858 Py_ssize_t thousands_sep_len;
8859 Py_ssize_t len;
8860
8861 if (unicode != NULL) {
8862 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008863 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008864 }
8865 else {
8866 kind = PyUnicode_1BYTE_KIND;
8867 data = NULL;
8868 }
8869 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8870 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8871 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8872 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008873 if (thousands_sep_kind < kind) {
8874 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8875 if (!thousands_sep_data)
8876 return -1;
8877 }
8878 else {
8879 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8880 if (!data)
8881 return -1;
8882 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008883 }
8884
Benjamin Petersonead6b532011-12-20 17:23:42 -06008885 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008887 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008888 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008889 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008890 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008891 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008892 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008893 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008894 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008895 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008896 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008897 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008899 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008900 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008901 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008902 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008903 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008905 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008906 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008907 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008908 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008909 break;
8910 default:
8911 assert(0);
8912 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008914 if (unicode != NULL && thousands_sep_kind != kind) {
8915 if (thousands_sep_kind < kind)
8916 PyMem_Free(thousands_sep_data);
8917 else
8918 PyMem_Free(data);
8919 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008920 if (unicode == NULL) {
8921 *maxchar = 127;
8922 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07008923 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02008924 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008925 }
8926 }
8927 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928}
8929
8930
/* Helper macro to fix up start/end slice values in place.

   `end` is capped at `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if it is still negative.  `start` and `end`
   must be modifiable lvalues; all three arguments are evaluated more than
   once, so they must be free of side effects.

   The body is wrapped in do { ... } while (0) so that the macro behaves
   as a single statement, e.g. as the unbraced body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008945
Alexander Belopolsky40018472011-02-26 01:02:56 +00008946Py_ssize_t
8947PyUnicode_Count(PyObject *str,
8948 PyObject *substr,
8949 Py_ssize_t start,
8950 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008952 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008953 PyObject* str_obj;
8954 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955 int kind1, kind2, kind;
8956 void *buf1 = NULL, *buf2 = NULL;
8957 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008958
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008959 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008960 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008962 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008963 if (!sub_obj) {
8964 Py_DECREF(str_obj);
8965 return -1;
8966 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008967 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008968 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 Py_DECREF(str_obj);
8970 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971 }
Tim Petersced69f82003-09-16 20:30:58 +00008972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 kind1 = PyUnicode_KIND(str_obj);
8974 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008975 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008978 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008979 if (kind2 > kind) {
8980 Py_DECREF(sub_obj);
8981 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008982 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008983 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008984 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008985 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008986 if (!buf2)
8987 goto onError;
8988 len1 = PyUnicode_GET_LENGTH(str_obj);
8989 len2 = PyUnicode_GET_LENGTH(sub_obj);
8990
8991 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008992 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008993 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008994 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8995 result = asciilib_count(
8996 ((Py_UCS1*)buf1) + start, end - start,
8997 buf2, len2, PY_SSIZE_T_MAX
8998 );
8999 else
9000 result = ucs1lib_count(
9001 ((Py_UCS1*)buf1) + start, end - start,
9002 buf2, len2, PY_SSIZE_T_MAX
9003 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 break;
9005 case PyUnicode_2BYTE_KIND:
9006 result = ucs2lib_count(
9007 ((Py_UCS2*)buf1) + start, end - start,
9008 buf2, len2, PY_SSIZE_T_MAX
9009 );
9010 break;
9011 case PyUnicode_4BYTE_KIND:
9012 result = ucs4lib_count(
9013 ((Py_UCS4*)buf1) + start, end - start,
9014 buf2, len2, PY_SSIZE_T_MAX
9015 );
9016 break;
9017 default:
9018 assert(0); result = 0;
9019 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009020
9021 Py_DECREF(sub_obj);
9022 Py_DECREF(str_obj);
9023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024 if (kind2 != kind)
9025 PyMem_Free(buf2);
9026
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 onError:
9029 Py_DECREF(sub_obj);
9030 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 if (kind2 != kind && buf2)
9032 PyMem_Free(buf2);
9033 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034}
9035
Alexander Belopolsky40018472011-02-26 01:02:56 +00009036Py_ssize_t
9037PyUnicode_Find(PyObject *str,
9038 PyObject *sub,
9039 Py_ssize_t start,
9040 Py_ssize_t end,
9041 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009043 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009044
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009046 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009047 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009048 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009049 if (!sub) {
9050 Py_DECREF(str);
9051 return -2;
9052 }
9053 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9054 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009055 Py_DECREF(str);
9056 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057 }
Tim Petersced69f82003-09-16 20:30:58 +00009058
Victor Stinner794d5672011-10-10 03:21:36 +02009059 result = any_find_slice(direction,
9060 str, sub, start, end
9061 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009062
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009064 Py_DECREF(sub);
9065
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066 return result;
9067}
9068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069Py_ssize_t
9070PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9071 Py_ssize_t start, Py_ssize_t end,
9072 int direction)
9073{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009075 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 if (PyUnicode_READY(str) == -1)
9077 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009078 if (start < 0 || end < 0) {
9079 PyErr_SetString(PyExc_IndexError, "string index out of range");
9080 return -2;
9081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 if (end > PyUnicode_GET_LENGTH(str))
9083 end = PyUnicode_GET_LENGTH(str);
9084 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009085 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9086 kind, end-start, ch, direction);
9087 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009089 else
9090 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091}
9092
Alexander Belopolsky40018472011-02-26 01:02:56 +00009093static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009094tailmatch(PyObject *self,
9095 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009096 Py_ssize_t start,
9097 Py_ssize_t end,
9098 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 int kind_self;
9101 int kind_sub;
9102 void *data_self;
9103 void *data_sub;
9104 Py_ssize_t offset;
9105 Py_ssize_t i;
9106 Py_ssize_t end_sub;
9107
9108 if (PyUnicode_READY(self) == -1 ||
9109 PyUnicode_READY(substring) == -1)
9110 return 0;
9111
9112 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113 return 1;
9114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9116 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009118 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 kind_self = PyUnicode_KIND(self);
9121 data_self = PyUnicode_DATA(self);
9122 kind_sub = PyUnicode_KIND(substring);
9123 data_sub = PyUnicode_DATA(substring);
9124 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9125
9126 if (direction > 0)
9127 offset = end;
9128 else
9129 offset = start;
9130
9131 if (PyUnicode_READ(kind_self, data_self, offset) ==
9132 PyUnicode_READ(kind_sub, data_sub, 0) &&
9133 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9134 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9135 /* If both are of the same kind, memcmp is sufficient */
9136 if (kind_self == kind_sub) {
9137 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009138 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 data_sub,
9140 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009141 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 }
9143 /* otherwise we have to compare each character by first accesing it */
9144 else {
9145 /* We do not need to compare 0 and len(substring)-1 because
9146 the if statement above ensured already that they are equal
9147 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02009148 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009149 for (i = 1; i < end_sub; ++i) {
9150 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9151 PyUnicode_READ(kind_sub, data_sub, i))
9152 return 0;
9153 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156 }
9157
9158 return 0;
9159}
9160
Alexander Belopolsky40018472011-02-26 01:02:56 +00009161Py_ssize_t
9162PyUnicode_Tailmatch(PyObject *str,
9163 PyObject *substr,
9164 Py_ssize_t start,
9165 Py_ssize_t end,
9166 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009168 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009169
Guido van Rossumd57fd912000-03-10 22:53:23 +00009170 str = PyUnicode_FromObject(str);
9171 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 substr = PyUnicode_FromObject(substr);
9174 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009175 Py_DECREF(str);
9176 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177 }
Tim Petersced69f82003-09-16 20:30:58 +00009178
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009179 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009180 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181 Py_DECREF(str);
9182 Py_DECREF(substr);
9183 return result;
9184}
9185
Guido van Rossumd57fd912000-03-10 22:53:23 +00009186/* Apply fixfct filter to the Unicode object self and return a
9187 reference to the modified object */
9188
Alexander Belopolsky40018472011-02-26 01:02:56 +00009189static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009190fixup(PyObject *self,
9191 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 PyObject *u;
9194 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009195 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009197 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009198 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009200 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 /* fix functions return the new maximum character in a string,
9203 if the kind of the resulting unicode object does not change,
9204 everything is fine. Otherwise we need to change the string kind
9205 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009206 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009207
9208 if (maxchar_new == 0) {
9209 /* no changes */;
9210 if (PyUnicode_CheckExact(self)) {
9211 Py_DECREF(u);
9212 Py_INCREF(self);
9213 return self;
9214 }
9215 else
9216 return u;
9217 }
9218
Victor Stinnere6abb482012-05-02 01:15:40 +02009219 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220
Victor Stinnereaab6042011-12-11 22:22:39 +01009221 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009223
9224 /* In case the maximum character changed, we need to
9225 convert the string to the new category. */
9226 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9227 if (v == NULL) {
9228 Py_DECREF(u);
9229 return NULL;
9230 }
9231 if (maxchar_new > maxchar_old) {
9232 /* If the maxchar increased so that the kind changed, not all
9233 characters are representable anymore and we need to fix the
9234 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009235 _PyUnicode_FastCopyCharacters(v, 0,
9236 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009237 maxchar_old = fixfct(v);
9238 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 }
9240 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009241 _PyUnicode_FastCopyCharacters(v, 0,
9242 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009244 Py_DECREF(u);
9245 assert(_PyUnicode_CheckConsistency(v, 1));
9246 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247}
9248
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009249static PyObject *
9250ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009251{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009252 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9253 char *resdata, *data = PyUnicode_DATA(self);
9254 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009255
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009256 res = PyUnicode_New(len, 127);
9257 if (res == NULL)
9258 return NULL;
9259 resdata = PyUnicode_DATA(res);
9260 if (lower)
9261 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009263 _Py_bytes_upper(resdata, data, len);
9264 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009265}
9266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009268handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009269{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009270 Py_ssize_t j;
9271 int final_sigma;
9272 Py_UCS4 c;
9273 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009274
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009275 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9276
9277 where ! is a negation and \p{xxx} is a character with property xxx.
9278 */
9279 for (j = i - 1; j >= 0; j--) {
9280 c = PyUnicode_READ(kind, data, j);
9281 if (!_PyUnicode_IsCaseIgnorable(c))
9282 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009283 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009284 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9285 if (final_sigma) {
9286 for (j = i + 1; j < length; j++) {
9287 c = PyUnicode_READ(kind, data, j);
9288 if (!_PyUnicode_IsCaseIgnorable(c))
9289 break;
9290 }
9291 final_sigma = j == length || !_PyUnicode_IsCased(c);
9292 }
9293 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294}
9295
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009296static int
9297lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9298 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009300 /* Obscure special case. */
9301 if (c == 0x3A3) {
9302 mapped[0] = handle_capital_sigma(kind, data, length, i);
9303 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009305 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306}
9307
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009308static Py_ssize_t
9309do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009311 Py_ssize_t i, k = 0;
9312 int n_res, j;
9313 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009314
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009315 c = PyUnicode_READ(kind, data, 0);
9316 n_res = _PyUnicode_ToUpperFull(c, mapped);
9317 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009318 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009319 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009321 for (i = 1; i < length; i++) {
9322 c = PyUnicode_READ(kind, data, i);
9323 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9324 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009325 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009326 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009327 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009328 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009329 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009330}
9331
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009332static Py_ssize_t
9333do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9334 Py_ssize_t i, k = 0;
9335
9336 for (i = 0; i < length; i++) {
9337 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9338 int n_res, j;
9339 if (Py_UNICODE_ISUPPER(c)) {
9340 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9341 }
9342 else if (Py_UNICODE_ISLOWER(c)) {
9343 n_res = _PyUnicode_ToUpperFull(c, mapped);
9344 }
9345 else {
9346 n_res = 1;
9347 mapped[0] = c;
9348 }
9349 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009350 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009351 res[k++] = mapped[j];
9352 }
9353 }
9354 return k;
9355}
9356
9357static Py_ssize_t
9358do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9359 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009361 Py_ssize_t i, k = 0;
9362
9363 for (i = 0; i < length; i++) {
9364 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9365 int n_res, j;
9366 if (lower)
9367 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9368 else
9369 n_res = _PyUnicode_ToUpperFull(c, mapped);
9370 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009371 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009372 res[k++] = mapped[j];
9373 }
9374 }
9375 return k;
9376}
9377
9378static Py_ssize_t
9379do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9380{
9381 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9382}
9383
9384static Py_ssize_t
9385do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9386{
9387 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9388}
9389
Benjamin Petersone51757f2012-01-12 21:10:29 -05009390static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009391do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9392{
9393 Py_ssize_t i, k = 0;
9394
9395 for (i = 0; i < length; i++) {
9396 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9397 Py_UCS4 mapped[3];
9398 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9399 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009400 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009401 res[k++] = mapped[j];
9402 }
9403 }
9404 return k;
9405}
9406
9407static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009408do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9409{
9410 Py_ssize_t i, k = 0;
9411 int previous_is_cased;
9412
9413 previous_is_cased = 0;
9414 for (i = 0; i < length; i++) {
9415 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9416 Py_UCS4 mapped[3];
9417 int n_res, j;
9418
9419 if (previous_is_cased)
9420 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9421 else
9422 n_res = _PyUnicode_ToTitleFull(c, mapped);
9423
9424 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009425 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009426 res[k++] = mapped[j];
9427 }
9428
9429 previous_is_cased = _PyUnicode_IsCased(c);
9430 }
9431 return k;
9432}
9433
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009434static PyObject *
9435case_operation(PyObject *self,
9436 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9437{
9438 PyObject *res = NULL;
9439 Py_ssize_t length, newlength = 0;
9440 int kind, outkind;
9441 void *data, *outdata;
9442 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9443
Benjamin Petersoneea48462012-01-16 14:28:50 -05009444 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009445
9446 kind = PyUnicode_KIND(self);
9447 data = PyUnicode_DATA(self);
9448 length = PyUnicode_GET_LENGTH(self);
9449 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9450 if (tmp == NULL)
9451 return PyErr_NoMemory();
9452 newlength = perform(kind, data, length, tmp, &maxchar);
9453 res = PyUnicode_New(newlength, maxchar);
9454 if (res == NULL)
9455 goto leave;
9456 tmpend = tmp + newlength;
9457 outdata = PyUnicode_DATA(res);
9458 outkind = PyUnicode_KIND(res);
9459 switch (outkind) {
9460 case PyUnicode_1BYTE_KIND:
9461 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9462 break;
9463 case PyUnicode_2BYTE_KIND:
9464 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9465 break;
9466 case PyUnicode_4BYTE_KIND:
9467 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9468 break;
9469 default:
9470 assert(0);
9471 break;
9472 }
9473 leave:
9474 PyMem_FREE(tmp);
9475 return res;
9476}
9477
Tim Peters8ce9f162004-08-27 01:49:32 +00009478PyObject *
9479PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009482 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009484 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009485 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9486 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009487 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009489 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009491 int use_memcpy;
9492 unsigned char *res_data = NULL, *sep_data = NULL;
9493 PyObject *last_obj;
9494 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495
Tim Peters05eba1f2004-08-27 21:32:02 +00009496 fseq = PySequence_Fast(seq, "");
9497 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009498 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009499 }
9500
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009501 /* NOTE: the following code can't call back into Python code,
9502 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009503 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009504
Tim Peters05eba1f2004-08-27 21:32:02 +00009505 seqlen = PySequence_Fast_GET_SIZE(fseq);
9506 /* If empty sequence, return u"". */
9507 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009508 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009509 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009510 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009511
Tim Peters05eba1f2004-08-27 21:32:02 +00009512 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009513 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009514 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009515 if (seqlen == 1) {
9516 if (PyUnicode_CheckExact(items[0])) {
9517 res = items[0];
9518 Py_INCREF(res);
9519 Py_DECREF(fseq);
9520 return res;
9521 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009522 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009523 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009524 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009525 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009526 /* Set up sep and seplen */
9527 if (separator == NULL) {
9528 /* fall back to a blank space separator */
9529 sep = PyUnicode_FromOrdinal(' ');
9530 if (!sep)
9531 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009532 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009533 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009534 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009535 else {
9536 if (!PyUnicode_Check(separator)) {
9537 PyErr_Format(PyExc_TypeError,
9538 "separator: expected str instance,"
9539 " %.80s found",
9540 Py_TYPE(separator)->tp_name);
9541 goto onError;
9542 }
9543 if (PyUnicode_READY(separator))
9544 goto onError;
9545 sep = separator;
9546 seplen = PyUnicode_GET_LENGTH(separator);
9547 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9548 /* inc refcount to keep this code path symmetric with the
9549 above case of a blank separator */
9550 Py_INCREF(sep);
9551 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009552 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009553 }
9554
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009555 /* There are at least two things to join, or else we have a subclass
9556 * of str in the sequence.
9557 * Do a pre-pass to figure out the total amount of space we'll
9558 * need (sz), and see whether all argument are strings.
9559 */
9560 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009561#ifdef Py_DEBUG
9562 use_memcpy = 0;
9563#else
9564 use_memcpy = 1;
9565#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009566 for (i = 0; i < seqlen; i++) {
9567 const Py_ssize_t old_sz = sz;
9568 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009569 if (!PyUnicode_Check(item)) {
9570 PyErr_Format(PyExc_TypeError,
9571 "sequence item %zd: expected str instance,"
9572 " %.80s found",
9573 i, Py_TYPE(item)->tp_name);
9574 goto onError;
9575 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576 if (PyUnicode_READY(item) == -1)
9577 goto onError;
9578 sz += PyUnicode_GET_LENGTH(item);
9579 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009580 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009581 if (i != 0)
9582 sz += seplen;
9583 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9584 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009585 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009586 goto onError;
9587 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009588 if (use_memcpy && last_obj != NULL) {
9589 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9590 use_memcpy = 0;
9591 }
9592 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009593 }
Tim Petersced69f82003-09-16 20:30:58 +00009594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009596 if (res == NULL)
9597 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009598
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009599 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009600#ifdef Py_DEBUG
9601 use_memcpy = 0;
9602#else
9603 if (use_memcpy) {
9604 res_data = PyUnicode_1BYTE_DATA(res);
9605 kind = PyUnicode_KIND(res);
9606 if (seplen != 0)
9607 sep_data = PyUnicode_1BYTE_DATA(sep);
9608 }
9609#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009611 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009612 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009613 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009614 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009615 if (use_memcpy) {
9616 Py_MEMCPY(res_data,
9617 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009618 kind * seplen);
9619 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009620 }
9621 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009622 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009623 res_offset += seplen;
9624 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009625 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009626 itemlen = PyUnicode_GET_LENGTH(item);
9627 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009628 if (use_memcpy) {
9629 Py_MEMCPY(res_data,
9630 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009631 kind * itemlen);
9632 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009633 }
9634 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009635 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009636 res_offset += itemlen;
9637 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009638 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009639 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009640 if (use_memcpy)
9641 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009642 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009643 else
9644 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009645
Tim Peters05eba1f2004-08-27 21:32:02 +00009646 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009648 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650
Benjamin Peterson29060642009-01-31 22:14:21 +00009651 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009652 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009654 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655 return NULL;
9656}
9657
/* Store value into length consecutive code units of the buffer data,
   starting at code unit index start.  kind selects the width of a code unit
   (1, 2 or 4 byte kind); PyUnicode_WCHAR_KIND is not supported.

   This is a statement macro: the arguments may be evaluated more than once,
   so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9681
Victor Stinnerd3f08822012-05-29 12:57:52 +02009682void
9683_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9684 Py_UCS4 fill_char)
9685{
9686 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9687 const void *data = PyUnicode_DATA(unicode);
9688 assert(PyUnicode_IS_READY(unicode));
9689 assert(unicode_modifiable(unicode));
9690 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9691 assert(start >= 0);
9692 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9693 FILL(kind, data, fill_char, start, length);
9694}
9695
Victor Stinner3fe55312012-01-04 00:33:50 +01009696Py_ssize_t
9697PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9698 Py_UCS4 fill_char)
9699{
9700 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009701
9702 if (!PyUnicode_Check(unicode)) {
9703 PyErr_BadInternalCall();
9704 return -1;
9705 }
9706 if (PyUnicode_READY(unicode) == -1)
9707 return -1;
9708 if (unicode_check_modifiable(unicode))
9709 return -1;
9710
Victor Stinnerd3f08822012-05-29 12:57:52 +02009711 if (start < 0) {
9712 PyErr_SetString(PyExc_IndexError, "string index out of range");
9713 return -1;
9714 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009715 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9716 PyErr_SetString(PyExc_ValueError,
9717 "fill character is bigger than "
9718 "the string maximum character");
9719 return -1;
9720 }
9721
9722 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9723 length = Py_MIN(maxlen, length);
9724 if (length <= 0)
9725 return 0;
9726
Victor Stinnerd3f08822012-05-29 12:57:52 +02009727 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009728 return length;
9729}
9730
Victor Stinner9310abb2011-10-05 00:59:23 +02009731static PyObject *
9732pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009733 Py_ssize_t left,
9734 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 PyObject *u;
9738 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009739 int kind;
9740 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741
9742 if (left < 0)
9743 left = 0;
9744 if (right < 0)
9745 right = 0;
9746
Victor Stinnerc4b49542011-12-11 22:44:26 +01009747 if (left == 0 && right == 0)
9748 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9751 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009752 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9753 return NULL;
9754 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009756 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009757 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009758 if (!u)
9759 return NULL;
9760
9761 kind = PyUnicode_KIND(u);
9762 data = PyUnicode_DATA(u);
9763 if (left)
9764 FILL(kind, data, fill, 0, left);
9765 if (right)
9766 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009767 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009768 assert(_PyUnicode_CheckConsistency(u, 1));
9769 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770}
9771
Alexander Belopolsky40018472011-02-26 01:02:56 +00009772PyObject *
9773PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009775 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776
9777 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009778 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009779 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009780 if (PyUnicode_READY(string) == -1) {
9781 Py_DECREF(string);
9782 return NULL;
9783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009784
Benjamin Petersonead6b532011-12-20 17:23:42 -06009785 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009787 if (PyUnicode_IS_ASCII(string))
9788 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009789 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009790 PyUnicode_GET_LENGTH(string), keepends);
9791 else
9792 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009793 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009794 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795 break;
9796 case PyUnicode_2BYTE_KIND:
9797 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009798 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 PyUnicode_GET_LENGTH(string), keepends);
9800 break;
9801 case PyUnicode_4BYTE_KIND:
9802 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009803 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 PyUnicode_GET_LENGTH(string), keepends);
9805 break;
9806 default:
9807 assert(0);
9808 list = 0;
9809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810 Py_DECREF(string);
9811 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812}
9813
Alexander Belopolsky40018472011-02-26 01:02:56 +00009814static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009815split(PyObject *self,
9816 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009817 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 int kind1, kind2, kind;
9820 void *buf1, *buf2;
9821 Py_ssize_t len1, len2;
9822 PyObject* out;
9823
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009825 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 if (PyUnicode_READY(self) == -1)
9828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009831 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009833 if (PyUnicode_IS_ASCII(self))
9834 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009835 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009836 PyUnicode_GET_LENGTH(self), maxcount
9837 );
9838 else
9839 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009840 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009841 PyUnicode_GET_LENGTH(self), maxcount
9842 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 case PyUnicode_2BYTE_KIND:
9844 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009845 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 PyUnicode_GET_LENGTH(self), maxcount
9847 );
9848 case PyUnicode_4BYTE_KIND:
9849 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009850 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 PyUnicode_GET_LENGTH(self), maxcount
9852 );
9853 default:
9854 assert(0);
9855 return NULL;
9856 }
9857
9858 if (PyUnicode_READY(substring) == -1)
9859 return NULL;
9860
9861 kind1 = PyUnicode_KIND(self);
9862 kind2 = PyUnicode_KIND(substring);
9863 kind = kind1 > kind2 ? kind1 : kind2;
9864 buf1 = PyUnicode_DATA(self);
9865 buf2 = PyUnicode_DATA(substring);
9866 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009867 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 if (!buf1)
9869 return NULL;
9870 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009871 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 if (!buf2) {
9873 if (kind1 != kind) PyMem_Free(buf1);
9874 return NULL;
9875 }
9876 len1 = PyUnicode_GET_LENGTH(self);
9877 len2 = PyUnicode_GET_LENGTH(substring);
9878
Benjamin Petersonead6b532011-12-20 17:23:42 -06009879 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009881 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9882 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009883 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009884 else
9885 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009886 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 break;
9888 case PyUnicode_2BYTE_KIND:
9889 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009890 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 break;
9892 case PyUnicode_4BYTE_KIND:
9893 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009894 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 break;
9896 default:
9897 out = NULL;
9898 }
9899 if (kind1 != kind)
9900 PyMem_Free(buf1);
9901 if (kind2 != kind)
9902 PyMem_Free(buf2);
9903 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904}
9905
Alexander Belopolsky40018472011-02-26 01:02:56 +00009906static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009907rsplit(PyObject *self,
9908 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009909 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 int kind1, kind2, kind;
9912 void *buf1, *buf2;
9913 Py_ssize_t len1, len2;
9914 PyObject* out;
9915
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009916 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009917 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 if (PyUnicode_READY(self) == -1)
9920 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009923 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009925 if (PyUnicode_IS_ASCII(self))
9926 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009927 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009928 PyUnicode_GET_LENGTH(self), maxcount
9929 );
9930 else
9931 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009932 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009933 PyUnicode_GET_LENGTH(self), maxcount
9934 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 case PyUnicode_2BYTE_KIND:
9936 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009937 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 PyUnicode_GET_LENGTH(self), maxcount
9939 );
9940 case PyUnicode_4BYTE_KIND:
9941 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009942 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 PyUnicode_GET_LENGTH(self), maxcount
9944 );
9945 default:
9946 assert(0);
9947 return NULL;
9948 }
9949
9950 if (PyUnicode_READY(substring) == -1)
9951 return NULL;
9952
9953 kind1 = PyUnicode_KIND(self);
9954 kind2 = PyUnicode_KIND(substring);
9955 kind = kind1 > kind2 ? kind1 : kind2;
9956 buf1 = PyUnicode_DATA(self);
9957 buf2 = PyUnicode_DATA(substring);
9958 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009959 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009960 if (!buf1)
9961 return NULL;
9962 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009963 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 if (!buf2) {
9965 if (kind1 != kind) PyMem_Free(buf1);
9966 return NULL;
9967 }
9968 len1 = PyUnicode_GET_LENGTH(self);
9969 len2 = PyUnicode_GET_LENGTH(substring);
9970
Benjamin Petersonead6b532011-12-20 17:23:42 -06009971 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009973 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9974 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009975 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009976 else
9977 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009978 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 break;
9980 case PyUnicode_2BYTE_KIND:
9981 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009982 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 break;
9984 case PyUnicode_4BYTE_KIND:
9985 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009986 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 break;
9988 default:
9989 out = NULL;
9990 }
9991 if (kind1 != kind)
9992 PyMem_Free(buf1);
9993 if (kind2 != kind)
9994 PyMem_Free(buf2);
9995 return out;
9996}
9997
9998static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009999anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10000 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010002 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010004 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10005 return asciilib_find(buf1, len1, buf2, len2, offset);
10006 else
10007 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 case PyUnicode_2BYTE_KIND:
10009 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10010 case PyUnicode_4BYTE_KIND:
10011 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10012 }
10013 assert(0);
10014 return -1;
10015}
10016
10017static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010018anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10019 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010021 switch (kind) {
10022 case PyUnicode_1BYTE_KIND:
10023 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10024 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10025 else
10026 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10027 case PyUnicode_2BYTE_KIND:
10028 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10029 case PyUnicode_4BYTE_KIND:
10030 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10031 }
10032 assert(0);
10033 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010034}
10035
Alexander Belopolsky40018472011-02-26 01:02:56 +000010036static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037replace(PyObject *self, PyObject *str1,
10038 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010039{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 PyObject *u;
10041 char *sbuf = PyUnicode_DATA(self);
10042 char *buf1 = PyUnicode_DATA(str1);
10043 char *buf2 = PyUnicode_DATA(str2);
10044 int srelease = 0, release1 = 0, release2 = 0;
10045 int skind = PyUnicode_KIND(self);
10046 int kind1 = PyUnicode_KIND(str1);
10047 int kind2 = PyUnicode_KIND(str2);
10048 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10049 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10050 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010051 int mayshrink;
10052 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010053
10054 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010055 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010057 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058
Victor Stinner59de0ee2011-10-07 10:01:28 +020010059 if (str1 == str2)
10060 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 if (skind < kind1)
10062 /* substring too wide to be present */
10063 goto nothing;
10064
Victor Stinner49a0a212011-10-12 23:46:10 +020010065 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10066 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10067 /* Replacing str1 with str2 may cause a maxchar reduction in the
10068 result string. */
10069 mayshrink = (maxchar_str2 < maxchar);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010070 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010073 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010075 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010077 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010078 Py_UCS4 u1, u2;
10079 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010080 Py_ssize_t index, pos;
10081 char *src;
10082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010084 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10085 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010086 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010089 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010091 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010093
10094 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10095 index = 0;
10096 src = sbuf;
10097 while (--maxcount)
10098 {
10099 pos++;
10100 src += pos * PyUnicode_KIND(self);
10101 slen -= pos;
10102 index += pos;
10103 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10104 if (pos < 0)
10105 break;
10106 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10107 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010108 }
10109 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 int rkind = skind;
10111 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010112 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 if (kind1 < rkind) {
10115 /* widen substring */
10116 buf1 = _PyUnicode_AsKind(str1, rkind);
10117 if (!buf1) goto error;
10118 release1 = 1;
10119 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010120 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010121 if (i < 0)
10122 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 if (rkind > kind2) {
10124 /* widen replacement */
10125 buf2 = _PyUnicode_AsKind(str2, rkind);
10126 if (!buf2) goto error;
10127 release2 = 1;
10128 }
10129 else if (rkind < kind2) {
10130 /* widen self and buf1 */
10131 rkind = kind2;
10132 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010133 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 sbuf = _PyUnicode_AsKind(self, rkind);
10135 if (!sbuf) goto error;
10136 srelease = 1;
10137 buf1 = _PyUnicode_AsKind(str1, rkind);
10138 if (!buf1) goto error;
10139 release1 = 1;
10140 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010141 u = PyUnicode_New(slen, maxchar);
10142 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010144 assert(PyUnicode_KIND(u) == rkind);
10145 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010146
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010147 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010148 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010149 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010151 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010153
10154 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010155 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010156 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010157 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010158 if (i == -1)
10159 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010160 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010162 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010164 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010166 }
10167 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010169 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 int rkind = skind;
10171 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010174 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 buf1 = _PyUnicode_AsKind(str1, rkind);
10176 if (!buf1) goto error;
10177 release1 = 1;
10178 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010179 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010180 if (n == 0)
10181 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010183 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 buf2 = _PyUnicode_AsKind(str2, rkind);
10185 if (!buf2) goto error;
10186 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010189 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 rkind = kind2;
10191 sbuf = _PyUnicode_AsKind(self, rkind);
10192 if (!sbuf) goto error;
10193 srelease = 1;
10194 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010195 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 buf1 = _PyUnicode_AsKind(str1, rkind);
10197 if (!buf1) goto error;
10198 release1 = 1;
10199 }
10200 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10201 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010202 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 PyErr_SetString(PyExc_OverflowError,
10204 "replace string is too long");
10205 goto error;
10206 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010207 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010208 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010209 _Py_INCREF_UNICODE_EMPTY();
10210 if (!unicode_empty)
10211 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010212 u = unicode_empty;
10213 goto done;
10214 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010215 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 PyErr_SetString(PyExc_OverflowError,
10217 "replace string is too long");
10218 goto error;
10219 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010220 u = PyUnicode_New(new_size, maxchar);
10221 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010223 assert(PyUnicode_KIND(u) == rkind);
10224 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 ires = i = 0;
10226 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010227 while (n-- > 0) {
10228 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010229 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010230 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010231 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010232 if (j == -1)
10233 break;
10234 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010235 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010236 memcpy(res + rkind * ires,
10237 sbuf + rkind * i,
10238 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010240 }
10241 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010243 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010245 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010251 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010252 memcpy(res + rkind * ires,
10253 sbuf + rkind * i,
10254 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010255 }
10256 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010257 /* interleave */
10258 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010259 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010261 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010263 if (--n <= 0)
10264 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010265 memcpy(res + rkind * ires,
10266 sbuf + rkind * i,
10267 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 ires++;
10269 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010270 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010271 memcpy(res + rkind * ires,
10272 sbuf + rkind * i,
10273 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010274 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010275 }
10276
10277 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010278 unicode_adjust_maxchar(&u);
10279 if (u == NULL)
10280 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010282
10283 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 if (srelease)
10285 PyMem_FREE(sbuf);
10286 if (release1)
10287 PyMem_FREE(buf1);
10288 if (release2)
10289 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010290 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010292
Benjamin Peterson29060642009-01-31 22:14:21 +000010293 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010294 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 if (srelease)
10296 PyMem_FREE(sbuf);
10297 if (release1)
10298 PyMem_FREE(buf1);
10299 if (release2)
10300 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010301 return unicode_result_unchanged(self);
10302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 error:
10304 if (srelease && sbuf)
10305 PyMem_FREE(sbuf);
10306 if (release1 && buf1)
10307 PyMem_FREE(buf1);
10308 if (release2 && buf2)
10309 PyMem_FREE(buf2);
10310 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311}
10312
10313/* --- Unicode Object Methods --------------------------------------------- */
10314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010315PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010316 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317\n\
10318Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010319characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320
10321static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010322unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010323{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010324 if (PyUnicode_READY(self) == -1)
10325 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010326 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327}
10328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010329PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010330 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331\n\
10332Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010333have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334
10335static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010336unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010337{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010338 if (PyUnicode_READY(self) == -1)
10339 return NULL;
10340 if (PyUnicode_GET_LENGTH(self) == 0)
10341 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010342 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343}
10344
Benjamin Petersond5890c82012-01-14 13:23:30 -050010345PyDoc_STRVAR(casefold__doc__,
10346 "S.casefold() -> str\n\
10347\n\
10348Return a version of S suitable for caseless comparisons.");
10349
10350static PyObject *
10351unicode_casefold(PyObject *self)
10352{
10353 if (PyUnicode_READY(self) == -1)
10354 return NULL;
10355 if (PyUnicode_IS_ASCII(self))
10356 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010357 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010358}
10359
10360
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010361/* Argument converter. Coerces to a single unicode character */
10362
10363static int
10364convert_uc(PyObject *obj, void *addr)
10365{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010367 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010368
Benjamin Peterson14339b62009-01-31 16:36:08 +000010369 uniobj = PyUnicode_FromObject(obj);
10370 if (uniobj == NULL) {
10371 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010372 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010373 return 0;
10374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010376 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010377 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010378 Py_DECREF(uniobj);
10379 return 0;
10380 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010382 Py_DECREF(uniobj);
10383 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010384}
10385
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010386PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010387 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010389Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010390done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391
10392static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010393unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010395 Py_ssize_t marg, left;
10396 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 Py_UCS4 fillchar = ' ';
10398
Victor Stinnere9a29352011-10-01 02:14:59 +020010399 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401
Benjamin Petersonbac79492012-01-14 13:34:47 -050010402 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403 return NULL;
10404
Victor Stinnerc4b49542011-12-11 22:44:26 +010010405 if (PyUnicode_GET_LENGTH(self) >= width)
10406 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407
Victor Stinnerc4b49542011-12-11 22:44:26 +010010408 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010409 left = marg / 2 + (marg & width & 1);
10410
Victor Stinner9310abb2011-10-05 00:59:23 +020010411 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412}
10413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414/* This function assumes that str1 and str2 are readied by the caller. */
10415
Marc-André Lemburge5034372000-08-08 08:04:29 +000010416static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010417unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010418{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 int kind1, kind2;
10420 void *data1, *data2;
10421 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 kind1 = PyUnicode_KIND(str1);
10424 kind2 = PyUnicode_KIND(str2);
10425 data1 = PyUnicode_DATA(str1);
10426 data2 = PyUnicode_DATA(str2);
10427 len1 = PyUnicode_GET_LENGTH(str1);
10428 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 for (i = 0; i < len1 && i < len2; ++i) {
10431 Py_UCS4 c1, c2;
10432 c1 = PyUnicode_READ(kind1, data1, i);
10433 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010434
10435 if (c1 != c2)
10436 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010437 }
10438
10439 return (len1 < len2) ? -1 : (len1 != len2);
10440}
10441
Alexander Belopolsky40018472011-02-26 01:02:56 +000010442int
10443PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10446 if (PyUnicode_READY(left) == -1 ||
10447 PyUnicode_READY(right) == -1)
10448 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010449 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010451 PyErr_Format(PyExc_TypeError,
10452 "Can't compare %.100s and %.100s",
10453 left->ob_type->tp_name,
10454 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010455 return -1;
10456}
10457
Martin v. Löwis5b222132007-06-10 09:51:05 +000010458int
10459PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10460{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 Py_ssize_t i;
10462 int kind;
10463 void *data;
10464 Py_UCS4 chr;
10465
Victor Stinner910337b2011-10-03 03:20:16 +020010466 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 if (PyUnicode_READY(uni) == -1)
10468 return -1;
10469 kind = PyUnicode_KIND(uni);
10470 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010471 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10473 if (chr != str[i])
10474 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010475 /* This check keeps Python strings that end in '\0' from comparing equal
10476 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010478 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010479 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010480 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010481 return 0;
10482}
10483
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010484
Benjamin Peterson29060642009-01-31 22:14:21 +000010485#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010486 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010487
Alexander Belopolsky40018472011-02-26 01:02:56 +000010488PyObject *
10489PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010490{
10491 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010492
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010493 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10494 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 if (PyUnicode_READY(left) == -1 ||
10496 PyUnicode_READY(right) == -1)
10497 return NULL;
10498 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10499 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010500 if (op == Py_EQ) {
10501 Py_INCREF(Py_False);
10502 return Py_False;
10503 }
10504 if (op == Py_NE) {
10505 Py_INCREF(Py_True);
10506 return Py_True;
10507 }
10508 }
10509 if (left == right)
10510 result = 0;
10511 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010512 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010513
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010514 /* Convert the return value to a Boolean */
10515 switch (op) {
10516 case Py_EQ:
10517 v = TEST_COND(result == 0);
10518 break;
10519 case Py_NE:
10520 v = TEST_COND(result != 0);
10521 break;
10522 case Py_LE:
10523 v = TEST_COND(result <= 0);
10524 break;
10525 case Py_GE:
10526 v = TEST_COND(result >= 0);
10527 break;
10528 case Py_LT:
10529 v = TEST_COND(result == -1);
10530 break;
10531 case Py_GT:
10532 v = TEST_COND(result == 1);
10533 break;
10534 default:
10535 PyErr_BadArgument();
10536 return NULL;
10537 }
10538 Py_INCREF(v);
10539 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010540 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010541
Brian Curtindfc80e32011-08-10 20:28:54 -050010542 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010543}
10544
Alexander Belopolsky40018472011-02-26 01:02:56 +000010545int
10546PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010547{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010548 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 int kind1, kind2, kind;
10550 void *buf1, *buf2;
10551 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010552 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010553
10554 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010555 sub = PyUnicode_FromObject(element);
10556 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010557 PyErr_Format(PyExc_TypeError,
10558 "'in <string>' requires string as left operand, not %s",
10559 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010560 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010561 }
10562
Thomas Wouters477c8d52006-05-27 19:21:47 +000010563 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010564 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010565 Py_DECREF(sub);
10566 return -1;
10567 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010568 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10569 Py_DECREF(sub);
10570 Py_DECREF(str);
10571 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 kind1 = PyUnicode_KIND(str);
10574 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010575 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 buf1 = PyUnicode_DATA(str);
10577 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010578 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010579 if (kind2 > kind) {
10580 Py_DECREF(sub);
10581 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010582 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010583 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010584 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 if (!buf2) {
10587 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010588 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 return -1;
10590 }
10591 len1 = PyUnicode_GET_LENGTH(str);
10592 len2 = PyUnicode_GET_LENGTH(sub);
10593
Benjamin Petersonead6b532011-12-20 17:23:42 -060010594 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 case PyUnicode_1BYTE_KIND:
10596 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10597 break;
10598 case PyUnicode_2BYTE_KIND:
10599 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10600 break;
10601 case PyUnicode_4BYTE_KIND:
10602 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10603 break;
10604 default:
10605 result = -1;
10606 assert(0);
10607 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010608
10609 Py_DECREF(str);
10610 Py_DECREF(sub);
10611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 if (kind2 != kind)
10613 PyMem_Free(buf2);
10614
Guido van Rossum403d68b2000-03-13 15:55:09 +000010615 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010616}
10617
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618/* Concat to string or Unicode object giving a new Unicode object. */
10619
Alexander Belopolsky40018472011-02-26 01:02:56 +000010620PyObject *
10621PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010624 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010625 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626
10627 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010629 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010630 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010633 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010634
10635 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010636 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010637 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010640 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010641 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 }
10644
Victor Stinner488fa492011-12-12 00:01:39 +010010645 u_len = PyUnicode_GET_LENGTH(u);
10646 v_len = PyUnicode_GET_LENGTH(v);
10647 if (u_len > PY_SSIZE_T_MAX - v_len) {
10648 PyErr_SetString(PyExc_OverflowError,
10649 "strings are too large to concat");
10650 goto onError;
10651 }
10652 new_len = u_len + v_len;
10653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010655 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010656 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657
Guido van Rossumd57fd912000-03-10 22:53:23 +000010658 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010659 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010661 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010662 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10663 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664 Py_DECREF(u);
10665 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010666 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668
Benjamin Peterson29060642009-01-31 22:14:21 +000010669 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670 Py_XDECREF(u);
10671 Py_XDECREF(v);
10672 return NULL;
10673}
10674
Walter Dörwald1ab83302007-05-18 17:15:44 +000010675void
Victor Stinner23e56682011-10-03 03:54:37 +020010676PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010677{
Victor Stinner23e56682011-10-03 03:54:37 +020010678 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010679 Py_UCS4 maxchar, maxchar2;
10680 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010681
10682 if (p_left == NULL) {
10683 if (!PyErr_Occurred())
10684 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010685 return;
10686 }
Victor Stinner23e56682011-10-03 03:54:37 +020010687 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010688 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010689 if (!PyErr_Occurred())
10690 PyErr_BadInternalCall();
10691 goto error;
10692 }
10693
Benjamin Petersonbac79492012-01-14 13:34:47 -050010694 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010695 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010696 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010697 goto error;
10698
Victor Stinner488fa492011-12-12 00:01:39 +010010699 /* Shortcuts */
10700 if (left == unicode_empty) {
10701 Py_DECREF(left);
10702 Py_INCREF(right);
10703 *p_left = right;
10704 return;
10705 }
10706 if (right == unicode_empty)
10707 return;
10708
10709 left_len = PyUnicode_GET_LENGTH(left);
10710 right_len = PyUnicode_GET_LENGTH(right);
10711 if (left_len > PY_SSIZE_T_MAX - right_len) {
10712 PyErr_SetString(PyExc_OverflowError,
10713 "strings are too large to concat");
10714 goto error;
10715 }
10716 new_len = left_len + right_len;
10717
10718 if (unicode_modifiable(left)
10719 && PyUnicode_CheckExact(right)
10720 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010721 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10722 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010723 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010724 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010725 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10726 {
10727 /* append inplace */
10728 if (unicode_resize(p_left, new_len) != 0) {
10729 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10730 * deallocated so it cannot be put back into
10731 * 'variable'. The MemoryError is raised when there
10732 * is no value in 'variable', which might (very
10733 * remotely) be a cause of incompatibilities.
10734 */
10735 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010736 }
Victor Stinner488fa492011-12-12 00:01:39 +010010737 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010738 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010739 }
Victor Stinner488fa492011-12-12 00:01:39 +010010740 else {
10741 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10742 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010743 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010744
Victor Stinner488fa492011-12-12 00:01:39 +010010745 /* Concat the two Unicode strings */
10746 res = PyUnicode_New(new_len, maxchar);
10747 if (res == NULL)
10748 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010749 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10750 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010751 Py_DECREF(left);
10752 *p_left = res;
10753 }
10754 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010755 return;
10756
10757error:
Victor Stinner488fa492011-12-12 00:01:39 +010010758 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010759}
10760
10761void
10762PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10763{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010764 PyUnicode_Append(pleft, right);
10765 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010766}
10767
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010768PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010769 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010771Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010772string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010773interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774
10775static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010776unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010778 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010779 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010780 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 int kind1, kind2, kind;
10783 void *buf1, *buf2;
10784 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785
Jesus Ceaac451502011-04-20 17:09:23 +020010786 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10787 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 kind1 = PyUnicode_KIND(self);
10791 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010792 if (kind2 > kind1)
10793 return PyLong_FromLong(0);
10794 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 buf1 = PyUnicode_DATA(self);
10796 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010798 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 if (!buf2) {
10800 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 return NULL;
10802 }
10803 len1 = PyUnicode_GET_LENGTH(self);
10804 len2 = PyUnicode_GET_LENGTH(substring);
10805
10806 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010807 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 case PyUnicode_1BYTE_KIND:
10809 iresult = ucs1lib_count(
10810 ((Py_UCS1*)buf1) + start, end - start,
10811 buf2, len2, PY_SSIZE_T_MAX
10812 );
10813 break;
10814 case PyUnicode_2BYTE_KIND:
10815 iresult = ucs2lib_count(
10816 ((Py_UCS2*)buf1) + start, end - start,
10817 buf2, len2, PY_SSIZE_T_MAX
10818 );
10819 break;
10820 case PyUnicode_4BYTE_KIND:
10821 iresult = ucs4lib_count(
10822 ((Py_UCS4*)buf1) + start, end - start,
10823 buf2, len2, PY_SSIZE_T_MAX
10824 );
10825 break;
10826 default:
10827 assert(0); iresult = 0;
10828 }
10829
10830 result = PyLong_FromSsize_t(iresult);
10831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 if (kind2 != kind)
10833 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834
10835 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010836
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837 return result;
10838}
10839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010840PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010841 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010843Encode S using the codec registered for encoding. Default encoding\n\
10844is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010845handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010846a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10847'xmlcharrefreplace' as well as any other name registered with\n\
10848codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849
10850static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010851unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010853 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854 char *encoding = NULL;
10855 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010856
Benjamin Peterson308d6372009-09-18 21:42:35 +000010857 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10858 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010859 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010860 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010861}
10862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010863PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010864 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865\n\
10866Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010867If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868
10869static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010870unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010872 Py_ssize_t i, j, line_pos, src_len, incr;
10873 Py_UCS4 ch;
10874 PyObject *u;
10875 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010877 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010878 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879
10880 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010881 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882
Antoine Pitrou22425222011-10-04 19:10:51 +020010883 if (PyUnicode_READY(self) == -1)
10884 return NULL;
10885
Thomas Wouters7e474022000-07-16 12:04:32 +000010886 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010887 src_len = PyUnicode_GET_LENGTH(self);
10888 i = j = line_pos = 0;
10889 kind = PyUnicode_KIND(self);
10890 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010891 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010892 for (; i < src_len; i++) {
10893 ch = PyUnicode_READ(kind, src_data, i);
10894 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010895 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010896 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010897 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010898 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010899 goto overflow;
10900 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010902 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010903 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010905 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010906 goto overflow;
10907 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010909 if (ch == '\n' || ch == '\r')
10910 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010912 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010913 if (!found)
10914 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010915
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010917 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918 if (!u)
10919 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010920 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010921
Antoine Pitroue71d5742011-10-04 15:55:09 +020010922 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923
Antoine Pitroue71d5742011-10-04 15:55:09 +020010924 for (; i < src_len; i++) {
10925 ch = PyUnicode_READ(kind, src_data, i);
10926 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010927 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010928 incr = tabsize - (line_pos % tabsize);
10929 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010930 FILL(kind, dest_data, ' ', j, incr);
10931 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010932 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010933 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010934 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010935 line_pos++;
10936 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010937 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010938 if (ch == '\n' || ch == '\r')
10939 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010941 }
10942 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010943 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010944
Antoine Pitroue71d5742011-10-04 15:55:09 +020010945 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010946 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10947 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948}
10949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010950PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010951 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952\n\
10953Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010954such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955arguments start and end are interpreted as in slice notation.\n\
10956\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010957Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
10959static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010962 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010963 Py_ssize_t start;
10964 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010965 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
Jesus Ceaac451502011-04-20 17:09:23 +020010967 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10968 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 if (PyUnicode_READY(self) == -1)
10972 return NULL;
10973 if (PyUnicode_READY(substring) == -1)
10974 return NULL;
10975
Victor Stinner7931d9a2011-11-04 00:22:48 +010010976 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977
10978 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 if (result == -2)
10981 return NULL;
10982
Christian Heimes217cfd12007-12-02 14:31:20 +000010983 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984}
10985
10986static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010987unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010989 void *data;
10990 enum PyUnicode_Kind kind;
10991 Py_UCS4 ch;
10992 PyObject *res;
10993
10994 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10995 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010997 }
10998 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10999 PyErr_SetString(PyExc_IndexError, "string index out of range");
11000 return NULL;
11001 }
11002 kind = PyUnicode_KIND(self);
11003 data = PyUnicode_DATA(self);
11004 ch = PyUnicode_READ(kind, data, index);
11005 if (ch < 256)
11006 return get_latin1_char(ch);
11007
11008 res = PyUnicode_New(1, ch);
11009 if (res == NULL)
11010 return NULL;
11011 kind = PyUnicode_KIND(res);
11012 data = PyUnicode_DATA(res);
11013 PyUnicode_WRITE(kind, data, 0, ch);
11014 assert(_PyUnicode_CheckConsistency(res, 1));
11015 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016}
11017
Guido van Rossumc2504932007-09-18 19:42:40 +000011018/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011019 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011020static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011021unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022{
Guido van Rossumc2504932007-09-18 19:42:40 +000011023 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011024 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011025
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011026#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011027 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011028#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 if (_PyUnicode_HASH(self) != -1)
11030 return _PyUnicode_HASH(self);
11031 if (PyUnicode_READY(self) == -1)
11032 return -1;
11033 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011034 /*
11035 We make the hash of the empty string be 0, rather than using
11036 (prefix ^ suffix), since this slightly obfuscates the hash secret
11037 */
11038 if (len == 0) {
11039 _PyUnicode_HASH(self) = 0;
11040 return 0;
11041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042
11043 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011044#define HASH(P) \
11045 x ^= (Py_uhash_t) *P << 7; \
11046 while (--len >= 0) \
11047 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048
Georg Brandl2fb477c2012-02-21 00:33:36 +010011049 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 switch (PyUnicode_KIND(self)) {
11051 case PyUnicode_1BYTE_KIND: {
11052 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11053 HASH(c);
11054 break;
11055 }
11056 case PyUnicode_2BYTE_KIND: {
11057 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11058 HASH(s);
11059 break;
11060 }
11061 default: {
11062 Py_UCS4 *l;
11063 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11064 "Impossible switch case in unicode_hash");
11065 l = PyUnicode_4BYTE_DATA(self);
11066 HASH(l);
11067 break;
11068 }
11069 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011070 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11071 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072
Guido van Rossumc2504932007-09-18 19:42:40 +000011073 if (x == -1)
11074 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011075 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011076 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011078#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011080PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011081 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011083Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084
11085static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011088 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011089 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011090 Py_ssize_t start;
11091 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092
Jesus Ceaac451502011-04-20 17:09:23 +020011093 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11094 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 if (PyUnicode_READY(self) == -1)
11098 return NULL;
11099 if (PyUnicode_READY(substring) == -1)
11100 return NULL;
11101
Victor Stinner7931d9a2011-11-04 00:22:48 +010011102 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103
11104 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011106 if (result == -2)
11107 return NULL;
11108
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109 if (result < 0) {
11110 PyErr_SetString(PyExc_ValueError, "substring not found");
11111 return NULL;
11112 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011113
Christian Heimes217cfd12007-12-02 14:31:20 +000011114 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115}
11116
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011117PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011118 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011120Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011121at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122
11123static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011124unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011126 Py_ssize_t i, length;
11127 int kind;
11128 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129 int cased;
11130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131 if (PyUnicode_READY(self) == -1)
11132 return NULL;
11133 length = PyUnicode_GET_LENGTH(self);
11134 kind = PyUnicode_KIND(self);
11135 data = PyUnicode_DATA(self);
11136
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 if (length == 1)
11139 return PyBool_FromLong(
11140 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011142 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011145
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 for (i = 0; i < length; i++) {
11148 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011149
Benjamin Peterson29060642009-01-31 22:14:21 +000011150 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11151 return PyBool_FromLong(0);
11152 else if (!cased && Py_UNICODE_ISLOWER(ch))
11153 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011155 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156}
11157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011158PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011159 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011160\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011161Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011162at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163
11164static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011165unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011167 Py_ssize_t i, length;
11168 int kind;
11169 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170 int cased;
11171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 if (PyUnicode_READY(self) == -1)
11173 return NULL;
11174 length = PyUnicode_GET_LENGTH(self);
11175 kind = PyUnicode_KIND(self);
11176 data = PyUnicode_DATA(self);
11177
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 if (length == 1)
11180 return PyBool_FromLong(
11181 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011183 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011186
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 for (i = 0; i < length; i++) {
11189 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011190
Benjamin Peterson29060642009-01-31 22:14:21 +000011191 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11192 return PyBool_FromLong(0);
11193 else if (!cased && Py_UNICODE_ISUPPER(ch))
11194 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011196 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197}
11198
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011199PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011200 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011202Return True if S is a titlecased string and there is at least one\n\
11203character in S, i.e. upper- and titlecase characters may only\n\
11204follow uncased characters and lowercase characters only cased ones.\n\
11205Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206
11207static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011208unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210 Py_ssize_t i, length;
11211 int kind;
11212 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213 int cased, previous_is_cased;
11214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 if (PyUnicode_READY(self) == -1)
11216 return NULL;
11217 length = PyUnicode_GET_LENGTH(self);
11218 kind = PyUnicode_KIND(self);
11219 data = PyUnicode_DATA(self);
11220
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 if (length == 1) {
11223 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11224 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11225 (Py_UNICODE_ISUPPER(ch) != 0));
11226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011228 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011229 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011230 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011231
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232 cased = 0;
11233 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 for (i = 0; i < length; i++) {
11235 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011236
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11238 if (previous_is_cased)
11239 return PyBool_FromLong(0);
11240 previous_is_cased = 1;
11241 cased = 1;
11242 }
11243 else if (Py_UNICODE_ISLOWER(ch)) {
11244 if (!previous_is_cased)
11245 return PyBool_FromLong(0);
11246 previous_is_cased = 1;
11247 cased = 1;
11248 }
11249 else
11250 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011252 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253}
11254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011255PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011256 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011258Return True if all characters in S are whitespace\n\
11259and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260
11261static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011262unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011264 Py_ssize_t i, length;
11265 int kind;
11266 void *data;
11267
11268 if (PyUnicode_READY(self) == -1)
11269 return NULL;
11270 length = PyUnicode_GET_LENGTH(self);
11271 kind = PyUnicode_KIND(self);
11272 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 if (length == 1)
11276 return PyBool_FromLong(
11277 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011279 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011280 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011281 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 for (i = 0; i < length; i++) {
11284 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011285 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011286 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011288 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289}
11290
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011291PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011293\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011294Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011295and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011296
11297static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011298unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011299{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011300 Py_ssize_t i, length;
11301 int kind;
11302 void *data;
11303
11304 if (PyUnicode_READY(self) == -1)
11305 return NULL;
11306 length = PyUnicode_GET_LENGTH(self);
11307 kind = PyUnicode_KIND(self);
11308 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011309
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011310 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 if (length == 1)
11312 return PyBool_FromLong(
11313 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011314
11315 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011317 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 for (i = 0; i < length; i++) {
11320 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011321 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011322 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011323 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011324}
11325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011326PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011327 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011328\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011329Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011330and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011331
11332static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011333unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011334{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 int kind;
11336 void *data;
11337 Py_ssize_t len, i;
11338
11339 if (PyUnicode_READY(self) == -1)
11340 return NULL;
11341
11342 kind = PyUnicode_KIND(self);
11343 data = PyUnicode_DATA(self);
11344 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011345
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011346 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 if (len == 1) {
11348 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11349 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11350 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011351
11352 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011354 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 for (i = 0; i < len; i++) {
11357 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011358 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011359 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011360 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011361 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011362}
11363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011364PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011365 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011367Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011368False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369
11370static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011371unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 Py_ssize_t i, length;
11374 int kind;
11375 void *data;
11376
11377 if (PyUnicode_READY(self) == -1)
11378 return NULL;
11379 length = PyUnicode_GET_LENGTH(self);
11380 kind = PyUnicode_KIND(self);
11381 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 if (length == 1)
11385 return PyBool_FromLong(
11386 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011388 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011390 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 for (i = 0; i < length; i++) {
11393 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011396 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397}
11398
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011399PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011400 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011402Return True if all characters in S are digits\n\
11403and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404
11405static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011406unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 Py_ssize_t i, length;
11409 int kind;
11410 void *data;
11411
11412 if (PyUnicode_READY(self) == -1)
11413 return NULL;
11414 length = PyUnicode_GET_LENGTH(self);
11415 kind = PyUnicode_KIND(self);
11416 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 if (length == 1) {
11420 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11421 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11422 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011424 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011426 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 for (i = 0; i < length; i++) {
11429 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011430 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011432 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433}
11434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011435PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011438Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011439False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440
11441static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011442unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 Py_ssize_t i, length;
11445 int kind;
11446 void *data;
11447
11448 if (PyUnicode_READY(self) == -1)
11449 return NULL;
11450 length = PyUnicode_GET_LENGTH(self);
11451 kind = PyUnicode_KIND(self);
11452 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 if (length == 1)
11456 return PyBool_FromLong(
11457 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011459 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011461 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 for (i = 0; i < length; i++) {
11464 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011465 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011467 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468}
11469
Martin v. Löwis47383402007-08-15 07:32:56 +000011470int
11471PyUnicode_IsIdentifier(PyObject *self)
11472{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 int kind;
11474 void *data;
11475 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011476 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 if (PyUnicode_READY(self) == -1) {
11479 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 }
11482
11483 /* Special case for empty strings */
11484 if (PyUnicode_GET_LENGTH(self) == 0)
11485 return 0;
11486 kind = PyUnicode_KIND(self);
11487 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011488
11489 /* PEP 3131 says that the first character must be in
11490 XID_Start and subsequent characters in XID_Continue,
11491 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011492 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011493 letters, digits, underscore). However, given the current
11494 definition of XID_Start and XID_Continue, it is sufficient
11495 to check just for these, except that _ must be allowed
11496 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011498 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011499 return 0;
11500
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011501 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011504 return 1;
11505}
11506
11507PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011508 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011509\n\
11510Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011511to the language definition.\n\
11512\n\
11513Use keyword.iskeyword() to test for reserved identifiers\n\
11514such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011515
11516static PyObject*
11517unicode_isidentifier(PyObject *self)
11518{
11519 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11520}
11521
Georg Brandl559e5d72008-06-11 18:37:52 +000011522PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011524\n\
11525Return True if all characters in S are considered\n\
11526printable in repr() or S is empty, False otherwise.");
11527
11528static PyObject*
11529unicode_isprintable(PyObject *self)
11530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 Py_ssize_t i, length;
11532 int kind;
11533 void *data;
11534
11535 if (PyUnicode_READY(self) == -1)
11536 return NULL;
11537 length = PyUnicode_GET_LENGTH(self);
11538 kind = PyUnicode_KIND(self);
11539 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011540
11541 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 if (length == 1)
11543 return PyBool_FromLong(
11544 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 for (i = 0; i < length; i++) {
11547 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011548 Py_RETURN_FALSE;
11549 }
11550 }
11551 Py_RETURN_TRUE;
11552}
11553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011554PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011555 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556\n\
11557Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011558iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559
11560static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011561unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011563 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564}
11565
Martin v. Löwis18e16552006-02-15 17:27:45 +000011566static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011567unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 if (PyUnicode_READY(self) == -1)
11570 return -1;
11571 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572}
11573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011574PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011575 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011577Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011578done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579
11580static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011581unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011583 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 Py_UCS4 fillchar = ' ';
11585
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011586 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587 return NULL;
11588
Benjamin Petersonbac79492012-01-14 13:34:47 -050011589 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591
Victor Stinnerc4b49542011-12-11 22:44:26 +010011592 if (PyUnicode_GET_LENGTH(self) >= width)
11593 return unicode_result_unchanged(self);
11594
11595 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596}
11597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011598PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011599 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011601Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602
11603static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011604unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011606 if (PyUnicode_READY(self) == -1)
11607 return NULL;
11608 if (PyUnicode_IS_ASCII(self))
11609 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011610 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611}
11612
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11621
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011622/* externally visible for str.strip(unicode) */
11623PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011624_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011625{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 void *data;
11627 int kind;
11628 Py_ssize_t i, j, len;
11629 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11632 return NULL;
11633
11634 kind = PyUnicode_KIND(self);
11635 data = PyUnicode_DATA(self);
11636 len = PyUnicode_GET_LENGTH(self);
11637 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11638 PyUnicode_DATA(sepobj),
11639 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011640
Benjamin Peterson14339b62009-01-31 16:36:08 +000011641 i = 0;
11642 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 while (i < len &&
11644 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011645 i++;
11646 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011647 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011648
Benjamin Peterson14339b62009-01-31 16:36:08 +000011649 j = len;
11650 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011651 do {
11652 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653 } while (j >= i &&
11654 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011655 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011656 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011657
Victor Stinner7931d9a2011-11-04 00:22:48 +010011658 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011659}
11660
11661PyObject*
11662PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11663{
11664 unsigned char *data;
11665 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011666 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667
Victor Stinnerde636f32011-10-01 03:55:54 +020011668 if (PyUnicode_READY(self) == -1)
11669 return NULL;
11670
Victor Stinner684d5fd2012-05-03 02:32:34 +020011671 length = PyUnicode_GET_LENGTH(self);
11672 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011673
Victor Stinner684d5fd2012-05-03 02:32:34 +020011674 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011675 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676
Victor Stinnerde636f32011-10-01 03:55:54 +020011677 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011678 PyErr_SetString(PyExc_IndexError, "string index out of range");
11679 return NULL;
11680 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011681 if (start >= length || end < start)
11682 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011683
Victor Stinner684d5fd2012-05-03 02:32:34 +020011684 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011685 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011686 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011687 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011688 }
11689 else {
11690 kind = PyUnicode_KIND(self);
11691 data = PyUnicode_1BYTE_DATA(self);
11692 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011693 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011694 length);
11695 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697
11698static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011699do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 int kind;
11702 void *data;
11703 Py_ssize_t len, i, j;
11704
11705 if (PyUnicode_READY(self) == -1)
11706 return NULL;
11707
11708 kind = PyUnicode_KIND(self);
11709 data = PyUnicode_DATA(self);
11710 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011711
Benjamin Peterson14339b62009-01-31 16:36:08 +000011712 i = 0;
11713 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011715 i++;
11716 }
11717 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011718
Benjamin Peterson14339b62009-01-31 16:36:08 +000011719 j = len;
11720 if (striptype != LEFTSTRIP) {
11721 do {
11722 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011724 j++;
11725 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011726
Victor Stinner7931d9a2011-11-04 00:22:48 +010011727 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728}
11729
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011730
11731static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011732do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011733{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011734 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011735
Benjamin Peterson14339b62009-01-31 16:36:08 +000011736 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11737 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011738
Benjamin Peterson14339b62009-01-31 16:36:08 +000011739 if (sep != NULL && sep != Py_None) {
11740 if (PyUnicode_Check(sep))
11741 return _PyUnicode_XStrip(self, striptype, sep);
11742 else {
11743 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011744 "%s arg must be None or str",
11745 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011746 return NULL;
11747 }
11748 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011749
Benjamin Peterson14339b62009-01-31 16:36:08 +000011750 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011751}
11752
11753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011754PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011756\n\
11757Return a copy of the string S with leading and trailing\n\
11758whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011759If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011760
11761static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011762unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011763{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011764 if (PyTuple_GET_SIZE(args) == 0)
11765 return do_strip(self, BOTHSTRIP); /* Common case */
11766 else
11767 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011768}
11769
11770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011771PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011773\n\
11774Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011775If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011776
11777static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011778unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011779{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011780 if (PyTuple_GET_SIZE(args) == 0)
11781 return do_strip(self, LEFTSTRIP); /* Common case */
11782 else
11783 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011784}
11785
11786
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011787PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011789\n\
11790Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011791If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011792
11793static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011794unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011795{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011796 if (PyTuple_GET_SIZE(args) == 0)
11797 return do_strip(self, RIGHTSTRIP); /* Common case */
11798 else
11799 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011800}
11801
11802
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011804unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011806 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
Serhiy Storchaka05997252013-01-26 12:14:02 +020011809 if (len < 1)
11810 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811
Victor Stinnerc4b49542011-12-11 22:44:26 +010011812 /* no repeat, return original string */
11813 if (len == 1)
11814 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011815
Benjamin Petersonbac79492012-01-14 13:34:47 -050011816 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 return NULL;
11818
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011819 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011820 PyErr_SetString(PyExc_OverflowError,
11821 "repeated string is too long");
11822 return NULL;
11823 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011825
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011826 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827 if (!u)
11828 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011829 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 if (PyUnicode_GET_LENGTH(str) == 1) {
11832 const int kind = PyUnicode_KIND(str);
11833 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011834 if (kind == PyUnicode_1BYTE_KIND) {
11835 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011836 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011837 }
11838 else if (kind == PyUnicode_2BYTE_KIND) {
11839 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011840 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011841 ucs2[n] = fill_char;
11842 } else {
11843 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11844 assert(kind == PyUnicode_4BYTE_KIND);
11845 for (n = 0; n < len; ++n)
11846 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011847 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011848 }
11849 else {
11850 /* number of characters copied this far */
11851 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011852 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 char *to = (char *) PyUnicode_DATA(u);
11854 Py_MEMCPY(to, PyUnicode_DATA(str),
11855 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011856 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 n = (done <= nchars-done) ? done : nchars-done;
11858 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011859 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011860 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861 }
11862
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011863 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011864 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865}
11866
Alexander Belopolsky40018472011-02-26 01:02:56 +000011867PyObject *
11868PyUnicode_Replace(PyObject *obj,
11869 PyObject *subobj,
11870 PyObject *replobj,
11871 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872{
11873 PyObject *self;
11874 PyObject *str1;
11875 PyObject *str2;
11876 PyObject *result;
11877
11878 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011879 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011880 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011882 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 Py_DECREF(self);
11884 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885 }
11886 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011887 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011888 Py_DECREF(self);
11889 Py_DECREF(str1);
11890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011892 if (PyUnicode_READY(self) == -1 ||
11893 PyUnicode_READY(str1) == -1 ||
11894 PyUnicode_READY(str2) == -1)
11895 result = NULL;
11896 else
11897 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898 Py_DECREF(self);
11899 Py_DECREF(str1);
11900 Py_DECREF(str2);
11901 return result;
11902}
11903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011904PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011905 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906\n\
11907Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011908old replaced by new. If the optional argument count is\n\
11909given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910
11911static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 PyObject *str1;
11915 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011916 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917 PyObject *result;
11918
Martin v. Löwis18e16552006-02-15 17:27:45 +000011919 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011921 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011922 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011924 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 return NULL;
11926 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011927 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011928 Py_DECREF(str1);
11929 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011930 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011931 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11932 result = NULL;
11933 else
11934 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935
11936 Py_DECREF(str1);
11937 Py_DECREF(str2);
11938 return result;
11939}
11940
Alexander Belopolsky40018472011-02-26 01:02:56 +000011941static PyObject *
11942unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011944 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 Py_ssize_t isize;
11946 Py_ssize_t osize, squote, dquote, i, o;
11947 Py_UCS4 max, quote;
11948 int ikind, okind;
11949 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011952 return NULL;
11953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 isize = PyUnicode_GET_LENGTH(unicode);
11955 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 /* Compute length of output, quote characters, and
11958 maximum character */
11959 osize = 2; /* quotes */
11960 max = 127;
11961 squote = dquote = 0;
11962 ikind = PyUnicode_KIND(unicode);
11963 for (i = 0; i < isize; i++) {
11964 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11965 switch (ch) {
11966 case '\'': squote++; osize++; break;
11967 case '"': dquote++; osize++; break;
11968 case '\\': case '\t': case '\r': case '\n':
11969 osize += 2; break;
11970 default:
11971 /* Fast-path ASCII */
11972 if (ch < ' ' || ch == 0x7f)
11973 osize += 4; /* \xHH */
11974 else if (ch < 0x7f)
11975 osize++;
11976 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11977 osize++;
11978 max = ch > max ? ch : max;
11979 }
11980 else if (ch < 0x100)
11981 osize += 4; /* \xHH */
11982 else if (ch < 0x10000)
11983 osize += 6; /* \uHHHH */
11984 else
11985 osize += 10; /* \uHHHHHHHH */
11986 }
11987 }
11988
11989 quote = '\'';
11990 if (squote) {
11991 if (dquote)
11992 /* Both squote and dquote present. Use squote,
11993 and escape them */
11994 osize += squote;
11995 else
11996 quote = '"';
11997 }
11998
11999 repr = PyUnicode_New(osize, max);
12000 if (repr == NULL)
12001 return NULL;
12002 okind = PyUnicode_KIND(repr);
12003 odata = PyUnicode_DATA(repr);
12004
12005 PyUnicode_WRITE(okind, odata, 0, quote);
12006 PyUnicode_WRITE(okind, odata, osize-1, quote);
12007
12008 for (i = 0, o = 1; i < isize; i++) {
12009 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012010
12011 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 if ((ch == quote) || (ch == '\\')) {
12013 PyUnicode_WRITE(okind, odata, o++, '\\');
12014 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012015 continue;
12016 }
12017
Benjamin Peterson29060642009-01-31 22:14:21 +000012018 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012019 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 PyUnicode_WRITE(okind, odata, o++, '\\');
12021 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012022 }
12023 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 PyUnicode_WRITE(okind, odata, o++, '\\');
12025 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012026 }
12027 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 PyUnicode_WRITE(okind, odata, o++, '\\');
12029 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012030 }
12031
12032 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012033 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 PyUnicode_WRITE(okind, odata, o++, '\\');
12035 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012036 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12037 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012038 }
12039
Georg Brandl559e5d72008-06-11 18:37:52 +000012040 /* Copy ASCII characters as-is */
12041 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012043 }
12044
Benjamin Peterson29060642009-01-31 22:14:21 +000012045 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012046 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012047 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012048 (categories Z* and C* except ASCII space)
12049 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012051 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012052 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012055 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12056 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012057 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012058 /* Map 16-bit characters to '\uxxxx' */
12059 else if (ch <= 0xffff) {
12060 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012061 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12062 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12063 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12064 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012065 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012066 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012067 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012068 PyUnicode_WRITE(okind, odata, o++, 'U');
12069 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12070 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12071 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12072 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012073 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12074 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12075 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12076 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012077 }
12078 }
12079 /* Copy characters as-is */
12080 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012082 }
12083 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012086 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012087 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088}
12089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012090PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012091 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092\n\
12093Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012094such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095arguments start and end are interpreted as in slice notation.\n\
12096\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012097Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098
12099static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012102 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012103 Py_ssize_t start;
12104 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012105 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106
Jesus Ceaac451502011-04-20 17:09:23 +020012107 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12108 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012109 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 if (PyUnicode_READY(self) == -1)
12112 return NULL;
12113 if (PyUnicode_READY(substring) == -1)
12114 return NULL;
12115
Victor Stinner7931d9a2011-11-04 00:22:48 +010012116 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117
12118 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 if (result == -2)
12121 return NULL;
12122
Christian Heimes217cfd12007-12-02 14:31:20 +000012123 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124}
12125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012126PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012127 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012129Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130
12131static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012134 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012135 Py_ssize_t start;
12136 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012137 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138
Jesus Ceaac451502011-04-20 17:09:23 +020012139 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12140 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012141 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 if (PyUnicode_READY(self) == -1)
12144 return NULL;
12145 if (PyUnicode_READY(substring) == -1)
12146 return NULL;
12147
Victor Stinner7931d9a2011-11-04 00:22:48 +010012148 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149
12150 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 if (result == -2)
12153 return NULL;
12154
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155 if (result < 0) {
12156 PyErr_SetString(PyExc_ValueError, "substring not found");
12157 return NULL;
12158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159
Christian Heimes217cfd12007-12-02 14:31:20 +000012160 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161}
12162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012163PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012164 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012166Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012167done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168
12169static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012170unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012172 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 Py_UCS4 fillchar = ' ';
12174
Victor Stinnere9a29352011-10-01 02:14:59 +020012175 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012177
Benjamin Petersonbac79492012-01-14 13:34:47 -050012178 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179 return NULL;
12180
Victor Stinnerc4b49542011-12-11 22:44:26 +010012181 if (PyUnicode_GET_LENGTH(self) >= width)
12182 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183
Victor Stinnerc4b49542011-12-11 22:44:26 +010012184 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185}
12186
Alexander Belopolsky40018472011-02-26 01:02:56 +000012187PyObject *
12188PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189{
12190 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012191
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192 s = PyUnicode_FromObject(s);
12193 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012194 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012195 if (sep != NULL) {
12196 sep = PyUnicode_FromObject(sep);
12197 if (sep == NULL) {
12198 Py_DECREF(s);
12199 return NULL;
12200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201 }
12202
Victor Stinner9310abb2011-10-05 00:59:23 +020012203 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204
12205 Py_DECREF(s);
12206 Py_XDECREF(sep);
12207 return result;
12208}
12209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012210PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012211 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212\n\
12213Return a list of the words in S, using sep as the\n\
12214delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012215splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012216whitespace string is a separator and empty strings are\n\
12217removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218
12219static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012220unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012222 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012224 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012226 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12227 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228 return NULL;
12229
12230 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012231 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012233 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012235 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236}
12237
Thomas Wouters477c8d52006-05-27 19:21:47 +000012238PyObject *
12239PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12240{
12241 PyObject* str_obj;
12242 PyObject* sep_obj;
12243 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244 int kind1, kind2, kind;
12245 void *buf1 = NULL, *buf2 = NULL;
12246 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012247
12248 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012249 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012251 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012252 if (!sep_obj) {
12253 Py_DECREF(str_obj);
12254 return NULL;
12255 }
12256 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12257 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012258 Py_DECREF(str_obj);
12259 return NULL;
12260 }
12261
Victor Stinner14f8f022011-10-05 20:58:25 +020012262 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012264 kind = Py_MAX(kind1, kind2);
12265 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012267 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 if (!buf1)
12269 goto onError;
12270 buf2 = PyUnicode_DATA(sep_obj);
12271 if (kind2 != kind)
12272 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12273 if (!buf2)
12274 goto onError;
12275 len1 = PyUnicode_GET_LENGTH(str_obj);
12276 len2 = PyUnicode_GET_LENGTH(sep_obj);
12277
Benjamin Petersonead6b532011-12-20 17:23:42 -060012278 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012280 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12281 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12282 else
12283 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 break;
12285 case PyUnicode_2BYTE_KIND:
12286 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12287 break;
12288 case PyUnicode_4BYTE_KIND:
12289 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12290 break;
12291 default:
12292 assert(0);
12293 out = 0;
12294 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012295
12296 Py_DECREF(sep_obj);
12297 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 if (kind1 != kind)
12299 PyMem_Free(buf1);
12300 if (kind2 != kind)
12301 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012302
12303 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 onError:
12305 Py_DECREF(sep_obj);
12306 Py_DECREF(str_obj);
12307 if (kind1 != kind && buf1)
12308 PyMem_Free(buf1);
12309 if (kind2 != kind && buf2)
12310 PyMem_Free(buf2);
12311 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012312}
12313
12314
12315PyObject *
12316PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12317{
12318 PyObject* str_obj;
12319 PyObject* sep_obj;
12320 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321 int kind1, kind2, kind;
12322 void *buf1 = NULL, *buf2 = NULL;
12323 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012324
12325 str_obj = PyUnicode_FromObject(str_in);
12326 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012327 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012328 sep_obj = PyUnicode_FromObject(sep_in);
12329 if (!sep_obj) {
12330 Py_DECREF(str_obj);
12331 return NULL;
12332 }
12333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 kind1 = PyUnicode_KIND(str_in);
12335 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012336 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 buf1 = PyUnicode_DATA(str_in);
12338 if (kind1 != kind)
12339 buf1 = _PyUnicode_AsKind(str_in, kind);
12340 if (!buf1)
12341 goto onError;
12342 buf2 = PyUnicode_DATA(sep_obj);
12343 if (kind2 != kind)
12344 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12345 if (!buf2)
12346 goto onError;
12347 len1 = PyUnicode_GET_LENGTH(str_obj);
12348 len2 = PyUnicode_GET_LENGTH(sep_obj);
12349
Benjamin Petersonead6b532011-12-20 17:23:42 -060012350 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012352 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12353 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12354 else
12355 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 break;
12357 case PyUnicode_2BYTE_KIND:
12358 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12359 break;
12360 case PyUnicode_4BYTE_KIND:
12361 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12362 break;
12363 default:
12364 assert(0);
12365 out = 0;
12366 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012367
12368 Py_DECREF(sep_obj);
12369 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 if (kind1 != kind)
12371 PyMem_Free(buf1);
12372 if (kind2 != kind)
12373 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012374
12375 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 onError:
12377 Py_DECREF(sep_obj);
12378 Py_DECREF(str_obj);
12379 if (kind1 != kind && buf1)
12380 PyMem_Free(buf1);
12381 if (kind2 != kind && buf2)
12382 PyMem_Free(buf2);
12383 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012384}
12385
12386PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012387 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012388\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012389Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012390the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012391found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012392
12393static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012394unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012395{
Victor Stinner9310abb2011-10-05 00:59:23 +020012396 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012397}
12398
12399PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012400 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012401\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012402Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012403the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012404separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012405
12406static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012407unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012408{
Victor Stinner9310abb2011-10-05 00:59:23 +020012409 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012410}
12411
Alexander Belopolsky40018472011-02-26 01:02:56 +000012412PyObject *
12413PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012414{
12415 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012416
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012417 s = PyUnicode_FromObject(s);
12418 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012419 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012420 if (sep != NULL) {
12421 sep = PyUnicode_FromObject(sep);
12422 if (sep == NULL) {
12423 Py_DECREF(s);
12424 return NULL;
12425 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012426 }
12427
Victor Stinner9310abb2011-10-05 00:59:23 +020012428 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012429
12430 Py_DECREF(s);
12431 Py_XDECREF(sep);
12432 return result;
12433}
12434
12435PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012436 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012437\n\
12438Return a list of the words in S, using sep as the\n\
12439delimiter string, starting at the end of the string and\n\
12440working to the front. If maxsplit is given, at most maxsplit\n\
12441splits are done. If sep is not specified, any whitespace string\n\
12442is a separator.");
12443
12444static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012445unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012446{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012447 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012448 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012449 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012450
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012451 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12452 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012453 return NULL;
12454
12455 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012456 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012457 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012458 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012459 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012460 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012461}
12462
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012463PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012464 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465\n\
12466Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012467Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012468is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469
12470static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012471unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012473 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012474 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012476 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12477 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478 return NULL;
12479
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012480 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481}
12482
12483static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012484PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012486 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487}
12488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012489PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012490 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491\n\
12492Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012493and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494
12495static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012496unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012498 if (PyUnicode_READY(self) == -1)
12499 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012500 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501}
12502
Georg Brandlceee0772007-11-27 23:48:05 +000012503PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012504 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012505\n\
12506Return a translation table usable for str.translate().\n\
12507If there is only one argument, it must be a dictionary mapping Unicode\n\
12508ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012509Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012510If there are two arguments, they must be strings of equal length, and\n\
12511in the resulting dictionary, each character in x will be mapped to the\n\
12512character at the same position in y. If there is a third argument, it\n\
12513must be a string, whose characters will be mapped to None in the result.");
12514
12515static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012516unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012517{
12518 PyObject *x, *y = NULL, *z = NULL;
12519 PyObject *new = NULL, *key, *value;
12520 Py_ssize_t i = 0;
12521 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012522
Georg Brandlceee0772007-11-27 23:48:05 +000012523 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12524 return NULL;
12525 new = PyDict_New();
12526 if (!new)
12527 return NULL;
12528 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 int x_kind, y_kind, z_kind;
12530 void *x_data, *y_data, *z_data;
12531
Georg Brandlceee0772007-11-27 23:48:05 +000012532 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012533 if (!PyUnicode_Check(x)) {
12534 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12535 "be a string if there is a second argument");
12536 goto err;
12537 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012539 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12540 "arguments must have equal length");
12541 goto err;
12542 }
12543 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 x_kind = PyUnicode_KIND(x);
12545 y_kind = PyUnicode_KIND(y);
12546 x_data = PyUnicode_DATA(x);
12547 y_data = PyUnicode_DATA(y);
12548 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12549 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012550 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012551 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012552 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012553 if (!value) {
12554 Py_DECREF(key);
12555 goto err;
12556 }
Georg Brandlceee0772007-11-27 23:48:05 +000012557 res = PyDict_SetItem(new, key, value);
12558 Py_DECREF(key);
12559 Py_DECREF(value);
12560 if (res < 0)
12561 goto err;
12562 }
12563 /* create entries for deleting chars in z */
12564 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 z_kind = PyUnicode_KIND(z);
12566 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012567 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012568 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012569 if (!key)
12570 goto err;
12571 res = PyDict_SetItem(new, key, Py_None);
12572 Py_DECREF(key);
12573 if (res < 0)
12574 goto err;
12575 }
12576 }
12577 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 int kind;
12579 void *data;
12580
Georg Brandlceee0772007-11-27 23:48:05 +000012581 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012582 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012583 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12584 "to maketrans it must be a dict");
12585 goto err;
12586 }
12587 /* copy entries into the new dict, converting string keys to int keys */
12588 while (PyDict_Next(x, &i, &key, &value)) {
12589 if (PyUnicode_Check(key)) {
12590 /* convert string keys to integer keys */
12591 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012592 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012593 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12594 "table must be of length 1");
12595 goto err;
12596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012597 kind = PyUnicode_KIND(key);
12598 data = PyUnicode_DATA(key);
12599 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012600 if (!newkey)
12601 goto err;
12602 res = PyDict_SetItem(new, newkey, value);
12603 Py_DECREF(newkey);
12604 if (res < 0)
12605 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012606 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012607 /* just keep integer keys */
12608 if (PyDict_SetItem(new, key, value) < 0)
12609 goto err;
12610 } else {
12611 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12612 "be strings or integers");
12613 goto err;
12614 }
12615 }
12616 }
12617 return new;
12618 err:
12619 Py_DECREF(new);
12620 return NULL;
12621}
12622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012623PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012624 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625\n\
12626Return a copy of the string S, where all characters have been mapped\n\
12627through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012628Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012629Unmapped characters are left untouched. Characters mapped to None\n\
12630are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631
12632static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636}
12637
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012638PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012641Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642
12643static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012644unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012646 if (PyUnicode_READY(self) == -1)
12647 return NULL;
12648 if (PyUnicode_IS_ASCII(self))
12649 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012650 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651}
12652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012653PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012654 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012656Pad a numeric string S with zeros on the left, to fill a field\n\
12657of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658
12659static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012660unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012662 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012663 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012664 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 int kind;
12666 void *data;
12667 Py_UCS4 chr;
12668
Martin v. Löwis18e16552006-02-15 17:27:45 +000012669 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670 return NULL;
12671
Benjamin Petersonbac79492012-01-14 13:34:47 -050012672 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012673 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674
Victor Stinnerc4b49542011-12-11 22:44:26 +010012675 if (PyUnicode_GET_LENGTH(self) >= width)
12676 return unicode_result_unchanged(self);
12677
12678 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679
12680 u = pad(self, fill, 0, '0');
12681
Walter Dörwald068325e2002-04-15 13:36:47 +000012682 if (u == NULL)
12683 return NULL;
12684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 kind = PyUnicode_KIND(u);
12686 data = PyUnicode_DATA(u);
12687 chr = PyUnicode_READ(kind, data, fill);
12688
12689 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 PyUnicode_WRITE(kind, data, 0, chr);
12692 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693 }
12694
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012695 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012696 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698
#if 0
/* Compiled out by the surrounding #if 0.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012707PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012708 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012710Return True if S starts with the specified prefix, False otherwise.\n\
12711With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012712With optional end, stop comparing S at that position.\n\
12713prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714
12715static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012716unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012717 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012719 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012720 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012721 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012722 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012723 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724
Jesus Ceaac451502011-04-20 17:09:23 +020012725 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012726 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012727 if (PyTuple_Check(subobj)) {
12728 Py_ssize_t i;
12729 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012730 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012731 if (substring == NULL)
12732 return NULL;
12733 result = tailmatch(self, substring, start, end, -1);
12734 Py_DECREF(substring);
12735 if (result) {
12736 Py_RETURN_TRUE;
12737 }
12738 }
12739 /* nothing matched */
12740 Py_RETURN_FALSE;
12741 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012742 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012743 if (substring == NULL) {
12744 if (PyErr_ExceptionMatches(PyExc_TypeError))
12745 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12746 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012747 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012748 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012749 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012750 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012751 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012752}
12753
12754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012755PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012756 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012757\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012758Return True if S ends with the specified suffix, False otherwise.\n\
12759With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012760With optional end, stop comparing S at that position.\n\
12761suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762
12763static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012764unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012765 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012767 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012768 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012769 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012770 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012771 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772
Jesus Ceaac451502011-04-20 17:09:23 +020012773 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012774 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012775 if (PyTuple_Check(subobj)) {
12776 Py_ssize_t i;
12777 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012778 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012779 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012780 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012781 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012782 result = tailmatch(self, substring, start, end, +1);
12783 Py_DECREF(substring);
12784 if (result) {
12785 Py_RETURN_TRUE;
12786 }
12787 }
12788 Py_RETURN_FALSE;
12789 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012790 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012791 if (substring == NULL) {
12792 if (PyErr_ExceptionMatches(PyExc_TypeError))
12793 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12794 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012795 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012796 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012797 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012799 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800}
12801
Victor Stinner202fdca2012-05-07 12:47:02 +020012802Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012803_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012804{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012805 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012806 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12807 writer->data = PyUnicode_DATA(writer->buffer);
12808 writer->kind = PyUnicode_KIND(writer->buffer);
12809}
12810
Victor Stinnerd3f08822012-05-29 12:57:52 +020012811void
12812_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012813{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012814 memset(writer, 0, sizeof(*writer));
12815#ifdef Py_DEBUG
12816 writer->kind = 5; /* invalid kind */
12817#endif
12818 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012819 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012820}
12821
Victor Stinnerd3f08822012-05-29 12:57:52 +020012822int
12823_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12824 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012825{
12826 Py_ssize_t newlen;
12827 PyObject *newbuffer;
12828
Victor Stinnerd3f08822012-05-29 12:57:52 +020012829 assert(length > 0);
12830
Victor Stinner202fdca2012-05-07 12:47:02 +020012831 if (length > PY_SSIZE_T_MAX - writer->pos) {
12832 PyErr_NoMemory();
12833 return -1;
12834 }
12835 newlen = writer->pos + length;
12836
Victor Stinnerd3f08822012-05-29 12:57:52 +020012837 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012838 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012839 /* overallocate 25% to limit the number of resize */
12840 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12841 newlen += newlen / 4;
12842 if (newlen < writer->min_length)
12843 newlen = writer->min_length;
12844 }
12845 writer->buffer = PyUnicode_New(newlen, maxchar);
12846 if (writer->buffer == NULL)
12847 return -1;
12848 _PyUnicodeWriter_Update(writer);
12849 return 0;
12850 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012851
Victor Stinnerd3f08822012-05-29 12:57:52 +020012852 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012853 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012854 /* overallocate 25% to limit the number of resize */
12855 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12856 newlen += newlen / 4;
12857 if (newlen < writer->min_length)
12858 newlen = writer->min_length;
12859 }
12860
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012861 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012862 /* resize + widen */
12863 newbuffer = PyUnicode_New(newlen, maxchar);
12864 if (newbuffer == NULL)
12865 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012866 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12867 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012868 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012869 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012870 }
12871 else {
12872 newbuffer = resize_compact(writer->buffer, newlen);
12873 if (newbuffer == NULL)
12874 return -1;
12875 }
12876 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012877 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012878 }
12879 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012880 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012881 newbuffer = PyUnicode_New(writer->size, maxchar);
12882 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012883 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012884 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12885 writer->buffer, 0, writer->pos);
12886 Py_DECREF(writer->buffer);
12887 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012888 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012889 }
12890 return 0;
12891}
12892
Victor Stinnerd3f08822012-05-29 12:57:52 +020012893int
12894_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12895{
12896 Py_UCS4 maxchar;
12897 Py_ssize_t len;
12898
12899 if (PyUnicode_READY(str) == -1)
12900 return -1;
12901 len = PyUnicode_GET_LENGTH(str);
12902 if (len == 0)
12903 return 0;
12904 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12905 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012906 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012907 Py_INCREF(str);
12908 writer->buffer = str;
12909 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012910 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012911 writer->size = 0;
12912 writer->pos += len;
12913 return 0;
12914 }
12915 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12916 return -1;
12917 }
12918 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12919 str, 0, len);
12920 writer->pos += len;
12921 return 0;
12922}
12923
12924PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012925_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012926{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012927 if (writer->pos == 0) {
12928 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020012929 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020012930 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012931 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012932 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12933 return writer->buffer;
12934 }
12935 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12936 PyObject *newbuffer;
12937 newbuffer = resize_compact(writer->buffer, writer->pos);
12938 if (newbuffer == NULL) {
12939 Py_DECREF(writer->buffer);
12940 return NULL;
12941 }
12942 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012943 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012944 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner2cb16aa2013-03-06 19:28:37 +010012945 return unicode_result_ready(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012946}
12947
Victor Stinnerd3f08822012-05-29 12:57:52 +020012948void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012949_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012950{
12951 Py_CLEAR(writer->buffer);
12952}
12953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012955
12956PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012957 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012958\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012959Return a formatted version of S, using substitutions from args and kwargs.\n\
12960The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012961
Eric Smith27bbca62010-11-04 17:06:58 +000012962PyDoc_STRVAR(format_map__doc__,
12963 "S.format_map(mapping) -> str\n\
12964\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012965Return a formatted version of S, using substitutions from mapping.\n\
12966The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012967
Eric Smith4a7d76d2008-05-30 18:10:19 +000012968static PyObject *
12969unicode__format__(PyObject* self, PyObject* args)
12970{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012971 PyObject *format_spec;
12972 _PyUnicodeWriter writer;
12973 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012974
12975 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12976 return NULL;
12977
Victor Stinnerd3f08822012-05-29 12:57:52 +020012978 if (PyUnicode_READY(self) == -1)
12979 return NULL;
12980 _PyUnicodeWriter_Init(&writer, 0);
12981 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12982 self, format_spec, 0,
12983 PyUnicode_GET_LENGTH(format_spec));
12984 if (ret == -1) {
12985 _PyUnicodeWriter_Dealloc(&writer);
12986 return NULL;
12987 }
12988 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012989}
12990
Eric Smith8c663262007-08-25 02:26:07 +000012991PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012992 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012993\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012994Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012995
12996static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012997unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012998{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 Py_ssize_t size;
13000
13001 /* If it's a compact object, account for base structure +
13002 character data. */
13003 if (PyUnicode_IS_COMPACT_ASCII(v))
13004 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13005 else if (PyUnicode_IS_COMPACT(v))
13006 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013007 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 else {
13009 /* If it is a two-block object, account for base object, and
13010 for character block if present. */
13011 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013012 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013014 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 }
13016 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013017 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013018 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013020 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013021 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013022
13023 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013024}
13025
13026PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013027 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013028
13029static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013030unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013031{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013032 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 if (!copy)
13034 return NULL;
13035 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013036}
13037
Guido van Rossumd57fd912000-03-10 22:53:23 +000013038static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013039 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013040 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013041 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13042 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013043 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13044 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013045 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013046 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13047 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13048 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13049 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13050 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013051 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013052 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13053 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13054 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013055 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013056 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13057 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13058 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013059 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013060 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013061 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013062 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013063 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13064 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13065 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13066 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13067 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13068 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13069 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13070 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13071 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13072 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13073 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13074 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13075 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13076 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013077 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013078 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013079 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013080 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013081 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013082 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013083 {"maketrans", (PyCFunction) unicode_maketrans,
13084 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013085 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013086#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013087 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013088 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089#endif
13090
Benjamin Peterson14339b62009-01-31 16:36:08 +000013091 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092 {NULL, NULL}
13093};
13094
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013095static PyObject *
13096unicode_mod(PyObject *v, PyObject *w)
13097{
Brian Curtindfc80e32011-08-10 20:28:54 -050013098 if (!PyUnicode_Check(v))
13099 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013100 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013101}
13102
13103static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013104 0, /*nb_add*/
13105 0, /*nb_subtract*/
13106 0, /*nb_multiply*/
13107 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013108};
13109
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013111 (lenfunc) unicode_length, /* sq_length */
13112 PyUnicode_Concat, /* sq_concat */
13113 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13114 (ssizeargfunc) unicode_getitem, /* sq_item */
13115 0, /* sq_slice */
13116 0, /* sq_ass_item */
13117 0, /* sq_ass_slice */
13118 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119};
13120
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013121static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013122unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013123{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013124 if (PyUnicode_READY(self) == -1)
13125 return NULL;
13126
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013127 if (PyIndex_Check(item)) {
13128 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013129 if (i == -1 && PyErr_Occurred())
13130 return NULL;
13131 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013133 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013134 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013135 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013136 PyObject *result;
13137 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013138 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013139 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013141 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013142 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013143 return NULL;
13144 }
13145
13146 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013147 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013148 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013149 slicelength == PyUnicode_GET_LENGTH(self)) {
13150 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013151 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013152 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013153 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013154 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013155 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013156 src_kind = PyUnicode_KIND(self);
13157 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013158 if (!PyUnicode_IS_ASCII(self)) {
13159 kind_limit = kind_maxchar_limit(src_kind);
13160 max_char = 0;
13161 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13162 ch = PyUnicode_READ(src_kind, src_data, cur);
13163 if (ch > max_char) {
13164 max_char = ch;
13165 if (max_char >= kind_limit)
13166 break;
13167 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013168 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013169 }
Victor Stinner55c99112011-10-13 01:17:06 +020013170 else
13171 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013172 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013173 if (result == NULL)
13174 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013175 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013176 dest_data = PyUnicode_DATA(result);
13177
13178 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013179 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13180 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013181 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013182 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013183 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013184 } else {
13185 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13186 return NULL;
13187 }
13188}
13189
13190static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013191 (lenfunc)unicode_length, /* mp_length */
13192 (binaryfunc)unicode_subscript, /* mp_subscript */
13193 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013194};
13195
Guido van Rossumd57fd912000-03-10 22:53:23 +000013196
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197/* Helpers for PyUnicode_Format() */
13198
13199static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013200getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013201{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013202 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013204 (*p_argidx)++;
13205 if (arglen < 0)
13206 return args;
13207 else
13208 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209 }
13210 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013211 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013212 return NULL;
13213}
13214
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013215/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013216
Victor Stinnerd3f08822012-05-29 12:57:52 +020013217static int
13218formatfloat(PyObject *v, int flags, int prec, int type,
13219 PyObject **p_output, _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013220{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013221 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013222 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013223 Py_ssize_t len;
Tim Petersced69f82003-09-16 20:30:58 +000013224
Guido van Rossumd57fd912000-03-10 22:53:23 +000013225 x = PyFloat_AsDouble(v);
13226 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013227 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013228
Guido van Rossumd57fd912000-03-10 22:53:23 +000013229 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013230 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013231
Eric Smith0923d1d2009-04-16 20:16:10 +000013232 p = PyOS_double_to_string(x, type, prec,
13233 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013234 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013235 return -1;
13236 len = strlen(p);
13237 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013238 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13239 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013240 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013241 }
Victor Stinner184252a2012-06-16 02:57:41 +020013242 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013243 writer->pos += len;
13244 }
13245 else
13246 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013247 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013248 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249}
13250
Victor Stinnerd0880d52012-04-27 23:40:13 +020013251/* formatlong() emulates the format codes d, u, o, x and X, and
13252 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13253 * Python's regular ints.
13254 * Return value: a new PyUnicodeObject*, or NULL if error.
13255 * The output string is of the form
13256 * "-"? ("0x" | "0X")? digit+
13257 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13258 * set in flags. The case of hex digits will be correct,
13259 * There will be at least prec digits, zero-filled on the left if
13260 * necessary to get that many.
13261 * val object to be converted
13262 * flags bitmask of format flags; only F_ALT is looked at
13263 * prec minimum number of digits; 0-fill on left if needed
13264 * type a character in [duoxX]; u acts the same as d
13265 *
13266 * CAUTION: o, x and X conversions on regular ints can never
13267 * produce a '-' sign, but can for Python's unbounded ints.
13268 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013269static PyObject*
13270formatlong(PyObject *val, int flags, int prec, int type)
13271{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013272 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013273 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013274 Py_ssize_t i;
13275 int sign; /* 1 if '-', else 0 */
13276 int len; /* number of characters */
13277 Py_ssize_t llen;
13278 int numdigits; /* len == numnondigits + numdigits */
13279 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013280
Victor Stinnerd0880d52012-04-27 23:40:13 +020013281 /* Avoid exceeding SSIZE_T_MAX */
13282 if (prec > INT_MAX-3) {
13283 PyErr_SetString(PyExc_OverflowError,
13284 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013285 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013286 }
13287
13288 assert(PyLong_Check(val));
13289
13290 switch (type) {
13291 case 'd':
13292 case 'u':
13293 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013294 if (PyBool_Check(val))
13295 result = PyNumber_ToBase(val, 10);
13296 else
13297 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013298 break;
13299 case 'o':
13300 numnondigits = 2;
13301 result = PyNumber_ToBase(val, 8);
13302 break;
13303 case 'x':
13304 case 'X':
13305 numnondigits = 2;
13306 result = PyNumber_ToBase(val, 16);
13307 break;
13308 default:
13309 assert(!"'type' not in [duoxX]");
13310 }
13311 if (!result)
13312 return NULL;
13313
13314 assert(unicode_modifiable(result));
13315 assert(PyUnicode_IS_READY(result));
13316 assert(PyUnicode_IS_ASCII(result));
13317
13318 /* To modify the string in-place, there can only be one reference. */
13319 if (Py_REFCNT(result) != 1) {
13320 PyErr_BadInternalCall();
13321 return NULL;
13322 }
13323 buf = PyUnicode_DATA(result);
13324 llen = PyUnicode_GET_LENGTH(result);
13325 if (llen > INT_MAX) {
13326 PyErr_SetString(PyExc_ValueError,
13327 "string too large in _PyBytes_FormatLong");
13328 return NULL;
13329 }
13330 len = (int)llen;
13331 sign = buf[0] == '-';
13332 numnondigits += sign;
13333 numdigits = len - numnondigits;
13334 assert(numdigits > 0);
13335
13336 /* Get rid of base marker unless F_ALT */
13337 if (((flags & F_ALT) == 0 &&
13338 (type == 'o' || type == 'x' || type == 'X'))) {
13339 assert(buf[sign] == '0');
13340 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13341 buf[sign+1] == 'o');
13342 numnondigits -= 2;
13343 buf += 2;
13344 len -= 2;
13345 if (sign)
13346 buf[0] = '-';
13347 assert(len == numnondigits + numdigits);
13348 assert(numdigits > 0);
13349 }
13350
13351 /* Fill with leading zeroes to meet minimum width. */
13352 if (prec > numdigits) {
13353 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13354 numnondigits + prec);
13355 char *b1;
13356 if (!r1) {
13357 Py_DECREF(result);
13358 return NULL;
13359 }
13360 b1 = PyBytes_AS_STRING(r1);
13361 for (i = 0; i < numnondigits; ++i)
13362 *b1++ = *buf++;
13363 for (i = 0; i < prec - numdigits; i++)
13364 *b1++ = '0';
13365 for (i = 0; i < numdigits; i++)
13366 *b1++ = *buf++;
13367 *b1 = '\0';
13368 Py_DECREF(result);
13369 result = r1;
13370 buf = PyBytes_AS_STRING(result);
13371 len = numnondigits + prec;
13372 }
13373
13374 /* Fix up case for hex conversions. */
13375 if (type == 'X') {
13376 /* Need to convert all lower case letters to upper case.
13377 and need to convert 0x to 0X (and -0x to -0X). */
13378 for (i = 0; i < len; i++)
13379 if (buf[i] >= 'a' && buf[i] <= 'x')
13380 buf[i] -= 'a'-'A';
13381 }
13382 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13383 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013384 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013385 Py_DECREF(result);
13386 result = unicode;
13387 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013388 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013389}
13390
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013391static Py_UCS4
13392formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013393{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013394 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013395 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013396 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013397 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013398 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 goto onError;
13400 }
13401 else {
13402 /* Integer input truncated to a character */
13403 long x;
13404 x = PyLong_AsLong(v);
13405 if (x == -1 && PyErr_Occurred())
13406 goto onError;
13407
Victor Stinner8faf8212011-12-08 22:14:11 +010013408 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013409 PyErr_SetString(PyExc_OverflowError,
13410 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013411 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013412 }
13413
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013414 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013415 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013416
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013418 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013420 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013421}
13422
Alexander Belopolsky40018472011-02-26 01:02:56 +000013423PyObject *
13424PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013425{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013426 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013427 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013428 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013429 PyObject *temp = NULL;
13430 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013431 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013432 void *fmt;
13433 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013434 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013435 Py_ssize_t sublen;
13436 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013437
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013439 PyErr_BadInternalCall();
13440 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013441 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013442 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013443 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 return NULL;
Victor Stinner19294072012-10-05 00:09:33 +020013445 if (PyUnicode_READY(uformat) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013446 Py_DECREF(uformat);
Victor Stinner19294072012-10-05 00:09:33 +020013447 return NULL;
13448 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013450 fmt = PyUnicode_DATA(uformat);
13451 fmtkind = PyUnicode_KIND(uformat);
13452 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13453 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013454
Victor Stinnerd3f08822012-05-29 12:57:52 +020013455 _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013456
Guido van Rossumd57fd912000-03-10 22:53:23 +000013457 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 arglen = PyTuple_Size(args);
13459 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013460 }
13461 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013462 arglen = -1;
13463 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013464 }
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013465 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013466 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013467
13468 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013469 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013470 Py_ssize_t nonfmtpos;
13471 nonfmtpos = fmtpos++;
13472 while (fmtcnt >= 0 &&
13473 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13474 fmtpos++;
13475 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013476 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013477 if (fmtcnt < 0)
13478 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013479 sublen = fmtpos - nonfmtpos;
13480 maxchar = _PyUnicode_FindMaxChar(uformat,
13481 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013482 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013483 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013484
Victor Stinnerd3f08822012-05-29 12:57:52 +020013485 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13486 uformat, nonfmtpos, sublen);
Victor Stinneree4544c2012-05-09 22:24:08 +020013487 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013488 }
13489 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013490 /* Got a format specifier */
13491 int flags = 0;
13492 Py_ssize_t width = -1;
13493 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013494 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013495 Py_UCS4 fill;
13496 int sign;
13497 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013498 int isnumok;
13499 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013500 void *pbuf = NULL;
13501 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013502 Py_UCS4 bufmaxchar;
13503 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013505 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013506 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13507 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013508 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013509 Py_ssize_t keylen;
13510 PyObject *key;
13511 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013512
Benjamin Peterson29060642009-01-31 22:14:21 +000013513 if (dict == NULL) {
13514 PyErr_SetString(PyExc_TypeError,
13515 "format requires a mapping");
13516 goto onError;
13517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013518 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013519 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013520 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 /* Skip over balanced parentheses */
13522 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013523 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13524 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013525 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013526 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013527 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013528 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013529 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013530 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013531 if (fmtcnt < 0 || pcount > 0) {
13532 PyErr_SetString(PyExc_ValueError,
13533 "incomplete format key");
13534 goto onError;
13535 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013536 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013537 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 if (key == NULL)
13539 goto onError;
13540 if (args_owned) {
13541 Py_DECREF(args);
13542 args_owned = 0;
13543 }
13544 args = PyObject_GetItem(dict, key);
13545 Py_DECREF(key);
13546 if (args == NULL) {
13547 goto onError;
13548 }
13549 args_owned = 1;
13550 arglen = -1;
13551 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013552 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013553 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013554 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13555 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 case '-': flags |= F_LJUST; continue;
13557 case '+': flags |= F_SIGN; continue;
13558 case ' ': flags |= F_BLANK; continue;
13559 case '#': flags |= F_ALT; continue;
13560 case '0': flags |= F_ZERO; continue;
13561 }
13562 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013563 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013564 if (c == '*') {
13565 v = getnextarg(args, arglen, &argidx);
13566 if (v == NULL)
13567 goto onError;
13568 if (!PyLong_Check(v)) {
13569 PyErr_SetString(PyExc_TypeError,
13570 "* wants int");
13571 goto onError;
13572 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013573 width = PyLong_AsSsize_t(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 if (width == -1 && PyErr_Occurred())
13575 goto onError;
13576 if (width < 0) {
13577 flags |= F_LJUST;
13578 width = -width;
13579 }
13580 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013581 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013582 }
13583 else if (c >= '0' && c <= '9') {
13584 width = c - '0';
13585 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013586 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013587 if (c < '0' || c > '9')
13588 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013589 /* Since c is unsigned, the RHS would end up as unsigned,
13590 mixing signed and unsigned comparison. Since c is between
13591 '0' and '9', casting to int is safe. */
13592 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013593 PyErr_SetString(PyExc_ValueError,
13594 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013595 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013596 }
13597 width = width*10 + (c - '0');
13598 }
13599 }
13600 if (c == '.') {
13601 prec = 0;
13602 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013603 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013604 if (c == '*') {
13605 v = getnextarg(args, arglen, &argidx);
13606 if (v == NULL)
13607 goto onError;
13608 if (!PyLong_Check(v)) {
13609 PyErr_SetString(PyExc_TypeError,
13610 "* wants int");
13611 goto onError;
13612 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013613 prec = _PyLong_AsInt(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013614 if (prec == -1 && PyErr_Occurred())
13615 goto onError;
13616 if (prec < 0)
13617 prec = 0;
13618 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013619 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013620 }
13621 else if (c >= '0' && c <= '9') {
13622 prec = c - '0';
13623 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013624 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013625 if (c < '0' || c > '9')
13626 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013627 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013628 PyErr_SetString(PyExc_ValueError,
13629 "prec too big");
13630 goto onError;
13631 }
13632 prec = prec*10 + (c - '0');
13633 }
13634 }
13635 } /* prec */
13636 if (fmtcnt >= 0) {
13637 if (c == 'h' || c == 'l' || c == 'L') {
13638 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013639 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013640 }
13641 }
13642 if (fmtcnt < 0) {
13643 PyErr_SetString(PyExc_ValueError,
13644 "incomplete format");
13645 goto onError;
13646 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013647 if (fmtcnt == 0)
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013648 writer.overallocate = 0;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013649
13650 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013651 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013652 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013653 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13654 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013655 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013656 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013657
Victor Stinneraff3cc62012-04-30 05:19:21 +020013658 v = getnextarg(args, arglen, &argidx);
13659 if (v == NULL)
13660 goto onError;
13661
Benjamin Peterson29060642009-01-31 22:14:21 +000013662 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013663 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013664 fill = ' ';
13665 switch (c) {
13666
Benjamin Peterson29060642009-01-31 22:14:21 +000013667 case 's':
13668 case 'r':
13669 case 'a':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013670 if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13671 /* Fast path */
13672 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13673 goto onError;
13674 goto nextarg;
13675 }
13676
Victor Stinner808fc0a2010-03-22 12:50:40 +000013677 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013678 temp = v;
13679 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013680 }
13681 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013682 if (c == 's')
13683 temp = PyObject_Str(v);
13684 else if (c == 'r')
13685 temp = PyObject_Repr(v);
13686 else
13687 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013688 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013689 break;
13690
13691 case 'i':
13692 case 'd':
13693 case 'u':
13694 case 'o':
13695 case 'x':
13696 case 'X':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013697 if (PyLong_CheckExact(v)
13698 && width == -1 && prec == -1
13699 && !(flags & (F_SIGN | F_BLANK)))
13700 {
13701 /* Fast path */
13702 switch(c)
13703 {
13704 case 'd':
13705 case 'i':
13706 case 'u':
13707 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13708 goto onError;
13709 goto nextarg;
13710 case 'x':
13711 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13712 goto onError;
13713 goto nextarg;
13714 case 'o':
13715 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13716 goto onError;
13717 goto nextarg;
13718 default:
13719 break;
13720 }
13721 }
13722
Benjamin Peterson29060642009-01-31 22:14:21 +000013723 isnumok = 0;
13724 if (PyNumber_Check(v)) {
13725 PyObject *iobj=NULL;
13726
13727 if (PyLong_Check(v)) {
13728 iobj = v;
13729 Py_INCREF(iobj);
13730 }
13731 else {
13732 iobj = PyNumber_Long(v);
13733 }
13734 if (iobj!=NULL) {
13735 if (PyLong_Check(iobj)) {
13736 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013737 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013738 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013739 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013740 }
13741 else {
13742 Py_DECREF(iobj);
13743 }
13744 }
13745 }
13746 if (!isnumok) {
13747 PyErr_Format(PyExc_TypeError,
13748 "%%%c format: a number is required, "
13749 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13750 goto onError;
13751 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013752 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013753 fill = '0';
13754 break;
13755
13756 case 'e':
13757 case 'E':
13758 case 'f':
13759 case 'F':
13760 case 'g':
13761 case 'G':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013762 if (width == -1 && prec == -1
13763 && !(flags & (F_SIGN | F_BLANK)))
13764 {
13765 /* Fast path */
13766 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13767 goto onError;
13768 goto nextarg;
13769 }
13770
Benjamin Peterson29060642009-01-31 22:14:21 +000013771 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013772 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013773 fill = '0';
Victor Stinnerd3f08822012-05-29 12:57:52 +020013774 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13775 temp = NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013776 break;
13777
13778 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013779 {
13780 Py_UCS4 ch = formatchar(v);
13781 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013782 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013783 if (width == -1 && prec == -1) {
13784 /* Fast path */
13785 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13786 goto onError;
13787 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13788 writer.pos += 1;
13789 goto nextarg;
13790 }
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013791 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013792 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013793 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013794
13795 default:
13796 PyErr_Format(PyExc_ValueError,
13797 "unsupported format character '%c' (0x%x) "
13798 "at index %zd",
13799 (31<=c && c<=126) ? (char)c : '?',
13800 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013801 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013802 goto onError;
13803 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013804 if (temp == NULL)
13805 goto onError;
13806 assert (PyUnicode_Check(temp));
Victor Stinnerd3f08822012-05-29 12:57:52 +020013807
13808 if (width == -1 && prec == -1
13809 && !(flags & (F_SIGN | F_BLANK)))
13810 {
13811 /* Fast path */
13812 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13813 goto onError;
13814 goto nextarg;
13815 }
13816
Victor Stinneraff3cc62012-04-30 05:19:21 +020013817 if (PyUnicode_READY(temp) == -1) {
13818 Py_CLEAR(temp);
13819 goto onError;
13820 }
13821 kind = PyUnicode_KIND(temp);
13822 pbuf = PyUnicode_DATA(temp);
13823 len = PyUnicode_GET_LENGTH(temp);
13824
13825 if (c == 's' || c == 'r' || c == 'a') {
13826 if (prec >= 0 && len > prec)
13827 len = prec;
13828 }
13829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013830 /* pbuf is initialized here. */
13831 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013832 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013833 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13834 if (ch == '-' || ch == '+') {
13835 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013836 len--;
13837 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013838 }
13839 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013840 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000013841 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013842 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000013843 else
13844 sign = 0;
13845 }
13846 if (width < len)
13847 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013848
13849 /* Compute the length and maximum character of the
13850 written characters */
13851 bufmaxchar = 127;
13852 if (!(flags & F_LJUST)) {
13853 if (sign) {
13854 if ((width-1) > len)
Benjamin Peterson7e303732013-06-10 09:19:46 -070013855 bufmaxchar = Py_MAX(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013856 }
13857 else {
13858 if (width > len)
Benjamin Peterson7e303732013-06-10 09:19:46 -070013859 bufmaxchar = Py_MAX(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013860 }
13861 }
13862 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
Benjamin Peterson7e303732013-06-10 09:19:46 -070013863 bufmaxchar = Py_MAX(bufmaxchar, maxchar);
Victor Stinneree4544c2012-05-09 22:24:08 +020013864
13865 buflen = width;
13866 if (sign && len == width)
13867 buflen++;
13868
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013869 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020013870 goto onError;
13871
13872 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000013873 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013874 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020013875 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13876 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013877 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013878 if (width > len)
13879 width--;
13880 }
13881 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013882 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013883 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013884 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013885 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13886 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13887 writer.pos += 2;
13888 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000013889 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013890 width -= 2;
13891 if (width < 0)
13892 width = 0;
13893 len -= 2;
13894 }
13895 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013896 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013897 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13898 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013899 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013900 }
13901 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013902 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013903 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13904 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013905 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013906 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013907 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13908 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013909 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13910 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13911 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013912 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013913 }
13914 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013915
Victor Stinnerc9d369f2012-06-16 02:22:37 +020013916 if (len) {
13917 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13918 temp, pindex, len);
13919 writer.pos += len;
13920 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013921 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013922 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013923 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13924 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013925 }
Victor Stinneree4544c2012-05-09 22:24:08 +020013926
Victor Stinnerd3f08822012-05-29 12:57:52 +020013927nextarg:
Benjamin Peterson29060642009-01-31 22:14:21 +000013928 if (dict && (argidx < arglen) && c != '%') {
13929 PyErr_SetString(PyExc_TypeError,
13930 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013931 goto onError;
13932 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013933 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013934 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013935 } /* until end */
13936 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013937 PyErr_SetString(PyExc_TypeError,
13938 "not all arguments converted during string formatting");
13939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013940 }
13941
13942 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013943 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013944 }
13945 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013946 Py_XDECREF(temp);
13947 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013948 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013949
Benjamin Peterson29060642009-01-31 22:14:21 +000013950 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013951 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013952 Py_XDECREF(temp);
13953 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013954 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013955 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013956 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013957 }
13958 return NULL;
13959}
13960
Jeremy Hylton938ace62002-07-17 16:30:39 +000013961static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013962unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13963
Tim Peters6d6c1a32001-08-02 04:15:00 +000013964static PyObject *
13965unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13966{
Benjamin Peterson29060642009-01-31 22:14:21 +000013967 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013968 static char *kwlist[] = {"object", "encoding", "errors", 0};
13969 char *encoding = NULL;
13970 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013971
Benjamin Peterson14339b62009-01-31 16:36:08 +000013972 if (type != &PyUnicode_Type)
13973 return unicode_subtype_new(type, args, kwds);
13974 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013975 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013976 return NULL;
13977 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020013978 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000013979 if (encoding == NULL && errors == NULL)
13980 return PyObject_Str(x);
13981 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013982 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013983}
13984
Guido van Rossume023fe02001-08-30 03:12:59 +000013985static PyObject *
13986unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13987{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013988 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013989 Py_ssize_t length, char_size;
13990 int share_wstr, share_utf8;
13991 unsigned int kind;
13992 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013993
Benjamin Peterson14339b62009-01-31 16:36:08 +000013994 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013995
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013996 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013997 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013998 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013999 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014000 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014001 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014002 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014003 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014004
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014005 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014006 if (self == NULL) {
14007 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014008 return NULL;
14009 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014010 kind = PyUnicode_KIND(unicode);
14011 length = PyUnicode_GET_LENGTH(unicode);
14012
14013 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014014#ifdef Py_DEBUG
14015 _PyUnicode_HASH(self) = -1;
14016#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014017 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014018#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014019 _PyUnicode_STATE(self).interned = 0;
14020 _PyUnicode_STATE(self).kind = kind;
14021 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014022 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014023 _PyUnicode_STATE(self).ready = 1;
14024 _PyUnicode_WSTR(self) = NULL;
14025 _PyUnicode_UTF8_LENGTH(self) = 0;
14026 _PyUnicode_UTF8(self) = NULL;
14027 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014028 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014029
14030 share_utf8 = 0;
14031 share_wstr = 0;
14032 if (kind == PyUnicode_1BYTE_KIND) {
14033 char_size = 1;
14034 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14035 share_utf8 = 1;
14036 }
14037 else if (kind == PyUnicode_2BYTE_KIND) {
14038 char_size = 2;
14039 if (sizeof(wchar_t) == 2)
14040 share_wstr = 1;
14041 }
14042 else {
14043 assert(kind == PyUnicode_4BYTE_KIND);
14044 char_size = 4;
14045 if (sizeof(wchar_t) == 4)
14046 share_wstr = 1;
14047 }
14048
14049 /* Ensure we won't overflow the length. */
14050 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14051 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014052 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014053 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014054 data = PyObject_MALLOC((length + 1) * char_size);
14055 if (data == NULL) {
14056 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014057 goto onError;
14058 }
14059
Victor Stinnerc3c74152011-10-02 20:39:55 +020014060 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014061 if (share_utf8) {
14062 _PyUnicode_UTF8_LENGTH(self) = length;
14063 _PyUnicode_UTF8(self) = data;
14064 }
14065 if (share_wstr) {
14066 _PyUnicode_WSTR_LENGTH(self) = length;
14067 _PyUnicode_WSTR(self) = (wchar_t *)data;
14068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014069
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014070 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014071 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014072 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014073#ifdef Py_DEBUG
14074 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14075#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014076 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014077 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014078
14079onError:
14080 Py_DECREF(unicode);
14081 Py_DECREF(self);
14082 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014083}
14084
/* Docstring of the str type; installed in the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014096
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014097static PyObject *unicode_iter(PyObject *seq);
14098
/* Type object for str.  Slots are listed positionally; each line is tagged
   with the PyTypeObject field it initializes. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14142
14143/* Initialize the Unicode implementation */
14144
Victor Stinner3a50e702011-10-18 21:21:00 +020014145int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014146{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014147 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014148 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014149 0x000A, /* LINE FEED */
14150 0x000D, /* CARRIAGE RETURN */
14151 0x001C, /* FILE SEPARATOR */
14152 0x001D, /* GROUP SEPARATOR */
14153 0x001E, /* RECORD SEPARATOR */
14154 0x0085, /* NEXT LINE */
14155 0x2028, /* LINE SEPARATOR */
14156 0x2029, /* PARAGRAPH SEPARATOR */
14157 };
14158
Fred Drakee4315f52000-05-09 19:53:39 +000014159 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014160 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014161 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014162 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014163 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014164
Guido van Rossumcacfc072002-05-24 19:01:59 +000014165 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014166 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014167
14168 /* initialize the linebreak bloom filter */
14169 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014170 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014171 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014172
Christian Heimes26532f72013-07-20 14:57:16 +020014173 if (PyType_Ready(&EncodingMapType) < 0)
14174 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014175
Benjamin Petersonc4311282012-10-30 23:21:10 -040014176 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14177 Py_FatalError("Can't initialize field name iterator type");
14178
14179 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14180 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014181
Victor Stinner3a50e702011-10-18 21:21:00 +020014182#ifdef HAVE_MBCS
14183 winver.dwOSVersionInfoSize = sizeof(winver);
14184 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14185 PyErr_SetFromWindowsErr(0);
14186 return -1;
14187 }
14188#endif
14189 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014190}
14191
14192/* Finalize the Unicode implementation */
14193
/* Clear the Unicode free list.  This implementation has nothing to
   release, so the reported count is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14199
Guido van Rossumd57fd912000-03-10 22:53:23 +000014200void
Thomas Wouters78890102000-07-22 19:25:51 +000014201_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014202{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014203 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014204
Serhiy Storchaka05997252013-01-26 12:14:02 +020014205 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014206
Serhiy Storchaka05997252013-01-26 12:14:02 +020014207 for (i = 0; i < 256; i++)
14208 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014209 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014210 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014211}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014212
Walter Dörwald16807132007-05-25 13:52:07 +000014213void
14214PyUnicode_InternInPlace(PyObject **p)
14215{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014216 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014217 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014218#ifdef Py_DEBUG
14219 assert(s != NULL);
14220 assert(_PyUnicode_CHECK(s));
14221#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014222 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014223 return;
14224#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014225 /* If it's a subclass, we don't really know what putting
14226 it in the interned dict might do. */
14227 if (!PyUnicode_CheckExact(s))
14228 return;
14229 if (PyUnicode_CHECK_INTERNED(s))
14230 return;
14231 if (interned == NULL) {
14232 interned = PyDict_New();
14233 if (interned == NULL) {
14234 PyErr_Clear(); /* Don't leave an exception */
14235 return;
14236 }
14237 }
14238 /* It might be that the GetItem call fails even
14239 though the key is present in the dictionary,
14240 namely when this happens during a stack overflow. */
14241 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014242 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014243 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014244
Benjamin Peterson29060642009-01-31 22:14:21 +000014245 if (t) {
14246 Py_INCREF(t);
14247 Py_DECREF(*p);
14248 *p = t;
14249 return;
14250 }
Walter Dörwald16807132007-05-25 13:52:07 +000014251
Benjamin Peterson14339b62009-01-31 16:36:08 +000014252 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014253 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014254 PyErr_Clear();
14255 PyThreadState_GET()->recursion_critical = 0;
14256 return;
14257 }
14258 PyThreadState_GET()->recursion_critical = 0;
14259 /* The two references in interned are not counted by refcnt.
14260 The deallocator will take care of this */
14261 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014262 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014263}
14264
14265void
14266PyUnicode_InternImmortal(PyObject **p)
14267{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014268 PyUnicode_InternInPlace(p);
14269 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014270 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014271 Py_INCREF(*p);
14272 }
Walter Dörwald16807132007-05-25 13:52:07 +000014273}
14274
14275PyObject *
14276PyUnicode_InternFromString(const char *cp)
14277{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014278 PyObject *s = PyUnicode_FromString(cp);
14279 if (s == NULL)
14280 return NULL;
14281 PyUnicode_InternInPlace(&s);
14282 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014283}
14284
Alexander Belopolsky40018472011-02-26 01:02:56 +000014285void
14286_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014287{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014288 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014289 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014290 Py_ssize_t i, n;
14291 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014292
Benjamin Peterson14339b62009-01-31 16:36:08 +000014293 if (interned == NULL || !PyDict_Check(interned))
14294 return;
14295 keys = PyDict_Keys(interned);
14296 if (keys == NULL || !PyList_Check(keys)) {
14297 PyErr_Clear();
14298 return;
14299 }
Walter Dörwald16807132007-05-25 13:52:07 +000014300
Benjamin Peterson14339b62009-01-31 16:36:08 +000014301 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14302 detector, interned unicode strings are not forcibly deallocated;
14303 rather, we give them their stolen references back, and then clear
14304 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014305
Benjamin Peterson14339b62009-01-31 16:36:08 +000014306 n = PyList_GET_SIZE(keys);
14307 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014308 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014309 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014310 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014311 if (PyUnicode_READY(s) == -1) {
14312 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014313 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014315 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014316 case SSTATE_NOT_INTERNED:
14317 /* XXX Shouldn't happen */
14318 break;
14319 case SSTATE_INTERNED_IMMORTAL:
14320 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014321 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014322 break;
14323 case SSTATE_INTERNED_MORTAL:
14324 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014325 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014326 break;
14327 default:
14328 Py_FatalError("Inconsistent interned string state.");
14329 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014330 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014331 }
14332 fprintf(stderr, "total size of all interned strings: "
14333 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14334 "mortal/immortal\n", mortal_size, immortal_size);
14335 Py_DECREF(keys);
14336 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014337 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014338}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014339
14340
/********************* Unicode Iterator **************************/
14342
14343typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014344 PyObject_HEAD
14345 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014346 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014347} unicodeiterobject;
14348
14349static void
14350unicodeiter_dealloc(unicodeiterobject *it)
14351{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014352 _PyObject_GC_UNTRACK(it);
14353 Py_XDECREF(it->it_seq);
14354 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014355}
14356
14357static int
14358unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14359{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014360 Py_VISIT(it->it_seq);
14361 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014362}
14363
14364static PyObject *
14365unicodeiter_next(unicodeiterobject *it)
14366{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014367 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014368
Benjamin Peterson14339b62009-01-31 16:36:08 +000014369 assert(it != NULL);
14370 seq = it->it_seq;
14371 if (seq == NULL)
14372 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014373 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014375 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14376 int kind = PyUnicode_KIND(seq);
14377 void *data = PyUnicode_DATA(seq);
14378 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14379 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014380 if (item != NULL)
14381 ++it->it_index;
14382 return item;
14383 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014384
Benjamin Peterson14339b62009-01-31 16:36:08 +000014385 Py_DECREF(seq);
14386 it->it_seq = NULL;
14387 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014388}
14389
14390static PyObject *
14391unicodeiter_len(unicodeiterobject *it)
14392{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014393 Py_ssize_t len = 0;
14394 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014395 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014396 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014397}
14398
14399PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14400
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014401static PyObject *
14402unicodeiter_reduce(unicodeiterobject *it)
14403{
14404 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014405 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014406 it->it_seq, it->it_index);
14407 } else {
14408 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14409 if (u == NULL)
14410 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014411 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014412 }
14413}
14414
14415PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14416
14417static PyObject *
14418unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14419{
14420 Py_ssize_t index = PyLong_AsSsize_t(state);
14421 if (index == -1 && PyErr_Occurred())
14422 return NULL;
14423 if (index < 0)
14424 index = 0;
14425 it->it_index = index;
14426 Py_RETURN_NONE;
14427}
14428
14429PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14430
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014431static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014432 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014433 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014434 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14435 reduce_doc},
14436 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14437 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014438 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014439};
14440
14441PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014442 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14443 "str_iterator", /* tp_name */
14444 sizeof(unicodeiterobject), /* tp_basicsize */
14445 0, /* tp_itemsize */
14446 /* methods */
14447 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14448 0, /* tp_print */
14449 0, /* tp_getattr */
14450 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014451 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014452 0, /* tp_repr */
14453 0, /* tp_as_number */
14454 0, /* tp_as_sequence */
14455 0, /* tp_as_mapping */
14456 0, /* tp_hash */
14457 0, /* tp_call */
14458 0, /* tp_str */
14459 PyObject_GenericGetAttr, /* tp_getattro */
14460 0, /* tp_setattro */
14461 0, /* tp_as_buffer */
14462 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14463 0, /* tp_doc */
14464 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14465 0, /* tp_clear */
14466 0, /* tp_richcompare */
14467 0, /* tp_weaklistoffset */
14468 PyObject_SelfIter, /* tp_iter */
14469 (iternextfunc)unicodeiter_next, /* tp_iternext */
14470 unicodeiter_methods, /* tp_methods */
14471 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014472};
14473
14474static PyObject *
14475unicode_iter(PyObject *seq)
14476{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014477 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014478
Benjamin Peterson14339b62009-01-31 16:36:08 +000014479 if (!PyUnicode_Check(seq)) {
14480 PyErr_BadInternalCall();
14481 return NULL;
14482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014483 if (PyUnicode_READY(seq) == -1)
14484 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014485 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14486 if (it == NULL)
14487 return NULL;
14488 it->it_index = 0;
14489 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014490 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014491 _PyObject_GC_TRACK(it);
14492 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014493}
14494
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014495
14496size_t
14497Py_UNICODE_strlen(const Py_UNICODE *u)
14498{
14499 int res = 0;
14500 while(*u++)
14501 res++;
14502 return res;
14503}
14504
14505Py_UNICODE*
14506Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14507{
14508 Py_UNICODE *u = s1;
14509 while ((*u++ = *s2++));
14510 return s1;
14511}
14512
14513Py_UNICODE*
14514Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14515{
14516 Py_UNICODE *u = s1;
14517 while ((*u++ = *s2++))
14518 if (n-- == 0)
14519 break;
14520 return s1;
14521}
14522
14523Py_UNICODE*
14524Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14525{
14526 Py_UNICODE *u1 = s1;
14527 u1 += Py_UNICODE_strlen(u1);
14528 Py_UNICODE_strcpy(u1, s2);
14529 return s1;
14530}
14531
14532int
14533Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14534{
14535 while (*s1 && *s2 && *s1 == *s2)
14536 s1++, s2++;
14537 if (*s1 && *s2)
14538 return (*s1 < *s2) ? -1 : +1;
14539 if (*s1)
14540 return 1;
14541 if (*s2)
14542 return -1;
14543 return 0;
14544}
14545
14546int
14547Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14548{
14549 register Py_UNICODE u1, u2;
14550 for (; n != 0; n--) {
14551 u1 = *s1;
14552 u2 = *s2;
14553 if (u1 != u2)
14554 return (u1 < u2) ? -1 : +1;
14555 if (u1 == '\0')
14556 return 0;
14557 s1++;
14558 s2++;
14559 }
14560 return 0;
14561}
14562
14563Py_UNICODE*
14564Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14565{
14566 const Py_UNICODE *p;
14567 for (p = s; *p; p++)
14568 if (*p == c)
14569 return (Py_UNICODE*)p;
14570 return NULL;
14571}
14572
14573Py_UNICODE*
14574Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14575{
14576 const Py_UNICODE *p;
14577 p = s + Py_UNICODE_strlen(s);
14578 while (p != s) {
14579 p--;
14580 if (*p == c)
14581 return (Py_UNICODE*)p;
14582 }
14583 return NULL;
14584}
Victor Stinner331ea922010-08-10 16:37:20 +000014585
Victor Stinner71133ff2010-09-01 23:43:53 +000014586Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014587PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014588{
Victor Stinner577db2c2011-10-11 22:12:48 +020014589 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014590 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014592 if (!PyUnicode_Check(unicode)) {
14593 PyErr_BadArgument();
14594 return NULL;
14595 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014596 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014597 if (u == NULL)
14598 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014599 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014600 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014601 PyErr_NoMemory();
14602 return NULL;
14603 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014604 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014605 size *= sizeof(Py_UNICODE);
14606 copy = PyMem_Malloc(size);
14607 if (copy == NULL) {
14608 PyErr_NoMemory();
14609 return NULL;
14610 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014611 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014612 return copy;
14613}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014614
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
14617
14618static PyMethodDef _string_methods[] = {
14619 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14620 METH_O, PyDoc_STR("split the argument as a field name")},
14621 {"formatter_parser", (PyCFunction) formatter_parser,
14622 METH_O, PyDoc_STR("parse the argument as a format string")},
14623 {NULL, NULL}
14624};
14625
14626static struct PyModuleDef _string_module = {
14627 PyModuleDef_HEAD_INIT,
14628 "_string",
14629 PyDoc_STR("string helper module"),
14630 0,
14631 _string_methods,
14632 NULL,
14633 NULL,
14634 NULL,
14635 NULL
14636};
14637
14638PyMODINIT_FUNC
14639PyInit__string(void)
14640{
14641 return PyModule_Create(&_string_module);
14642}
14643
14644
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014645#ifdef __cplusplus
14646}
14647#endif