blob: 156316b1543263b6f6d11922bfa57aded6d4695c [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
Serhiy Storchaka05997252013-01-26 12:14:02 +020060NOTE: In the interpreter's initialization phase, some globals are currently
61 initialized dynamically as needed. In the process Unicode objects may
62 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000063
64*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000065
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066
67#ifdef __cplusplus
68extern "C" {
69#endif
70
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* _PyUnicode_CHECK(op): sanity test used by the assert()s of the helper
   macros in this file.  Debug builds run the full structural consistency
   checker (without scanning the characters); other builds only perform
   the plain type check. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020079
/* Raw access to the utf8 member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory located right after the PyASCIIObject header; for every other
   string it is the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Raw access to the utf8_length member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Length of the UTF-8 buffer of a ready string.  For a compact ASCII
   string this is the length member of the PyASCIIObject header; for every
   other string it is the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Raw access to the wstr member (PyASCIIObject) and to the wstr_length
   member (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Raw access to the length, state and hash members of the PyASCIIObject
   header.  These expand to plain member accesses without any check. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked read of state.kind and of length: the object is validated with
   _PyUnicode_CHECK() before the member is read. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Raw access to the data.any member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200114
/* Drop whatever definition of PyUnicode_READY() is already in effect and
   install a local one that validates the object with _PyUnicode_CHECK().
   Evaluates to 0 when op is already ready, otherwise to the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200121
/* True if the utf8 pointer of op refers to the same memory as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op refers to the same memory as its character
   data.  Only the macro argument is referenced, so this works regardless
   of how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
129
/* True if op owns a UTF-8 buffer of its own: it is not a compact ASCII
   string, its utf8 pointer is set, and that pointer does not refer to the
   character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if op owns a wstr buffer of its own: its wstr pointer is set and
   either the string is not ready yet or the pointer does not refer to the
   character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
145
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   begin, end and to are each evaluated exactly once.  to is parenthesized
   before it is cast, so an expression such as "buf + offset" is computed
   in the caller's pointer type first and only then converted.  Most of
   the input is copied in groups of four characters; the remaining zero
   to three characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200169
Walter Dörwald16807132007-05-25 13:52:07 +0000170/* This dictionary holds all interned unicode strings. Note that references
171 to strings in this dictionary are *not* counted in the string's ob_refcnt.
172 When the interned string reaches a refcnt of 0 the string deallocation
173 function will delete the reference from this dictionary.
174
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000177*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200178static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000179
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200181static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182
Serhiy Storchaka678db842013-01-26 12:16:36 +0200183#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200184 do { \
185 if (unicode_empty != NULL) \
186 Py_INCREF(unicode_empty); \
187 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200188 unicode_empty = PyUnicode_New(0, 0); \
189 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200190 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200191 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
192 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200193 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000195
Serhiy Storchaka678db842013-01-26 12:16:36 +0200196#define _Py_RETURN_UNICODE_EMPTY() \
197 do { \
198 _Py_INCREF_UNICODE_EMPTY(); \
199 return unicode_empty; \
200 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200202/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200203static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200204
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205/* Single character Unicode strings in the Latin-1 range are being
206 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208
/* Fast detection of the most frequent whitespace characters.
   Entry c is 1 when the ASCII code point c (0 <= c < 128) is treated as
   whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
       0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
239
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200240/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200241static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200242static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100243static int unicode_modifiable(PyObject *unicode);
244
Victor Stinnerfe226c02011-10-03 03:52:20 +0200245
Alexander Belopolsky40018472011-02-26 01:02:56 +0000246static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100247_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200248static PyObject *
249_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
250static PyObject *
251_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
252
253static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000255 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100256 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000257 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
258
Alexander Belopolsky40018472011-02-26 01:02:56 +0000259static void
260raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300261 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100262 PyObject *unicode,
263 Py_ssize_t startpos, Py_ssize_t endpos,
264 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000265
/* Same for linebreaks.
   Entry c is 1 when the ASCII code point c (0 <= c < 128) is treated as a
   line break and 0 otherwise.  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
293
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300294/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
295 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000297PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000299#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000300 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000301#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000302 /* This is actually an illegal character, so it should
303 not be passed to unichr. */
304 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000305#endif
306}
307
#ifdef Py_DEBUG
/* Debug-build invariant checker for a Unicode object.

   op must be a Unicode object.  The structural invariants of its
   representation (compact ASCII, compact, legacy not-ready, legacy ready)
   are verified with assert().  When check_content is non-zero and the
   string is not of the wchar kind, the characters are scanned as well to
   verify that the narrowest possible kind is used and that the buffer is
   terminated by a null character.

   Always returns 1, which allows the call to be wrapped in assert(); a
   violated invariant trips one of the assertions below instead. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;
    unsigned int wstr_kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* the kind selected by the size of wchar_t */
#if SIZEOF_WCHAR_T == 2
    wstr_kind = PyUnicode_2BYTE_KIND;
#else
    wstr_kind = PyUnicode_4BYTE_KIND;
#endif

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: the characters directly follow
               the PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* legacy string that has not been made ready yet */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* legacy string that is ready */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else {
                assert (compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind matches the
               size of wchar_t */
            if (kind == wstr_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing buffer must come with a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t i = 0;

        while (i < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
            i++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii != 0) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character after the last one must be a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
423
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100424static PyObject*
425unicode_result_wchar(PyObject *unicode)
426{
427#ifndef Py_DEBUG
428 Py_ssize_t len;
429
430 assert(Py_REFCNT(unicode) == 1);
431
432 len = _PyUnicode_WSTR_LENGTH(unicode);
433 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100434 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200435 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100436 }
437
438 if (len == 1) {
439 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100440 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
442 Py_DECREF(unicode);
443 return latin1_char;
444 }
445 }
446
447 if (_PyUnicode_Ready(unicode) < 0) {
448 Py_XDECREF(unicode);
449 return NULL;
450 }
451#else
452 /* don't make the result ready in debug mode to ensure that the caller
453 makes the string ready before using it */
454 assert(_PyUnicode_CheckConsistency(unicode, 1));
455#endif
456 return unicode;
457}
458
459static PyObject*
460unicode_result_ready(PyObject *unicode)
461{
462 Py_ssize_t length;
463
464 length = PyUnicode_GET_LENGTH(unicode);
465 if (length == 0) {
466 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100467 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200468 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100469 }
470 return unicode_empty;
471 }
472
473 if (length == 1) {
474 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
475 if (ch < 256) {
476 PyObject *latin1_char = unicode_latin1[ch];
477 if (latin1_char != NULL) {
478 if (unicode != latin1_char) {
479 Py_INCREF(latin1_char);
480 Py_DECREF(unicode);
481 }
482 return latin1_char;
483 }
484 else {
485 assert(_PyUnicode_CheckConsistency(unicode, 1));
486 Py_INCREF(unicode);
487 unicode_latin1[ch] = unicode;
488 return unicode;
489 }
490 }
491 }
492
493 assert(_PyUnicode_CheckConsistency(unicode, 1));
494 return unicode;
495}
496
497static PyObject*
498unicode_result(PyObject *unicode)
499{
500 assert(_PyUnicode_CHECK(unicode));
501 if (PyUnicode_IS_READY(unicode))
502 return unicode_result_ready(unicode);
503 else
504 return unicode_result_wchar(unicode);
505}
506
Victor Stinnerc4b49542011-12-11 22:44:26 +0100507static PyObject*
508unicode_result_unchanged(PyObject *unicode)
509{
510 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500511 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100512 return NULL;
513 Py_INCREF(unicode);
514 return unicode;
515 }
516 else
517 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100518 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100519}
520
#ifdef HAVE_MBCS
/* Windows version information, only compiled in when HAVE_MBCS is defined.
   NOTE(review): presumably filled in during initialization and consulted
   by the MBCS codec code -- confirm against the users of winver. */
static OSVERSIONINFOEX winver;
#endif
524
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525/* --- Bloom Filters ----------------------------------------------------- */
526
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (the character masked with
   BLOOM_WIDTH - 1) as the bit index. */
530
531/* the linebreak mask is set up by Unicode_Init below */
532
Antoine Pitrouf068f942010-01-13 14:19:12 +0000533#if LONG_BIT >= 128
534#define BLOOM_WIDTH 128
535#elif LONG_BIT >= 64
536#define BLOOM_WIDTH 64
537#elif LONG_BIT >= 32
538#define BLOOM_WIDTH 32
539#else
540#error "LONG_BIT is smaller than 32"
541#endif
542
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543#define BLOOM_MASK unsigned long
544
Serhiy Storchaka05997252013-01-26 12:14:02 +0200545static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546
/* BLOOM_ADD(mask, ch): set the bit selected by ch in mask (mask must be a
   modifiable lvalue).
   BLOOM(mask, ch): non-zero if the bit selected by ch is set in mask.  A
   zero result proves that ch was never added; a non-zero result only means
   that it may have been.
   The bit index is ch masked with BLOOM_WIDTH - 1.  Both arguments are
   parenthesized so that compound expressions such as "a | b" can be
   passed as the mask. */
#define BLOOM_ADD(mask, ch) \
    ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch) \
    ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000549
/* Non-zero if ch is a line break.  Characters below 128 are looked up in
   the ascii_linebreak table; for all others the bloom_linebreak mask is
   tested first and Py_UNICODE_ISLINEBREAK() is only called when the mask
   does not rule the character out. */
#define BLOOM_LINEBREAK(ch)                                         \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                          \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
557 /* calculate simple bloom-style bitmask for a given unicode string */
558
Antoine Pitrouf068f942010-01-13 14:19:12 +0000559 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000560 Py_ssize_t i;
561
562 mask = 0;
563 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200564 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000565
566 return mask;
567}
568
/* Non-zero if chr occurs in the string str.  mask is tested first, so the
   search with PyUnicode_FindChar() over the whole string only runs when
   the mask does not rule chr out. */
#define BLOOM_MEMBER(mask, chr, str)                                \
    (BLOOM(mask, chr)                                               \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200573/* Compilation of templated routines */
574
575#include "stringlib/asciilib.h"
576#include "stringlib/fastsearch.h"
577#include "stringlib/partition.h"
578#include "stringlib/split.h"
579#include "stringlib/count.h"
580#include "stringlib/find.h"
581#include "stringlib/find_max_char.h"
582#include "stringlib/localeutil.h"
583#include "stringlib/undef.h"
584
585#include "stringlib/ucs1lib.h"
586#include "stringlib/fastsearch.h"
587#include "stringlib/partition.h"
588#include "stringlib/split.h"
589#include "stringlib/count.h"
590#include "stringlib/find.h"
591#include "stringlib/find_max_char.h"
592#include "stringlib/localeutil.h"
593#include "stringlib/undef.h"
594
595#include "stringlib/ucs2lib.h"
596#include "stringlib/fastsearch.h"
597#include "stringlib/partition.h"
598#include "stringlib/split.h"
599#include "stringlib/count.h"
600#include "stringlib/find.h"
601#include "stringlib/find_max_char.h"
602#include "stringlib/localeutil.h"
603#include "stringlib/undef.h"
604
605#include "stringlib/ucs4lib.h"
606#include "stringlib/fastsearch.h"
607#include "stringlib/partition.h"
608#include "stringlib/split.h"
609#include "stringlib/count.h"
610#include "stringlib/find.h"
611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200615#include "stringlib/unicodedefs.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100619#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200620
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621/* --- Unicode Object ----------------------------------------------------- */
622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200624fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200625
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200626Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
627 Py_ssize_t size, Py_UCS4 ch,
628 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200629{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200630 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
631
632 switch (kind) {
633 case PyUnicode_1BYTE_KIND:
634 {
635 Py_UCS1 ch1 = (Py_UCS1) ch;
636 if (ch1 == ch)
637 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
638 else
639 return -1;
640 }
641 case PyUnicode_2BYTE_KIND:
642 {
643 Py_UCS2 ch2 = (Py_UCS2) ch;
644 if (ch2 == ch)
645 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
646 else
647 return -1;
648 }
649 case PyUnicode_4BYTE_KIND:
650 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
651 default:
652 assert(0);
653 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655}
656
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657static PyObject*
658resize_compact(PyObject *unicode, Py_ssize_t length)
659{
660 Py_ssize_t char_size;
661 Py_ssize_t struct_size;
662 Py_ssize_t new_size;
663 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100664 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200665 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200666 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100667 assert(PyUnicode_IS_COMPACT(unicode));
668
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200669 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100670 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200671 struct_size = sizeof(PyASCIIObject);
672 else
673 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200674 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
677 PyErr_NoMemory();
678 return NULL;
679 }
680 new_size = (struct_size + (length + 1) * char_size);
681
Victor Stinner84def372011-12-11 20:04:56 +0100682 _Py_DEC_REFTOTAL;
683 _Py_ForgetReference(unicode);
684
685 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
686 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100687 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200688 PyErr_NoMemory();
689 return NULL;
690 }
Victor Stinner84def372011-12-11 20:04:56 +0100691 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100693
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200695 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100697 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200698 _PyUnicode_WSTR_LENGTH(unicode) = length;
699 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100700 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
701 PyObject_DEL(_PyUnicode_WSTR(unicode));
702 _PyUnicode_WSTR(unicode) = NULL;
703 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
705 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200706 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200707 return unicode;
708}
709
Alexander Belopolsky40018472011-02-26 01:02:56 +0000710static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200711resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712{
Victor Stinner95663112011-10-04 01:03:50 +0200713 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100714 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000717
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718 if (PyUnicode_IS_READY(unicode)) {
719 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200720 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 void *data;
722
723 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200724 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200725 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
726 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200727
728 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
729 PyErr_NoMemory();
730 return -1;
731 }
732 new_size = (length + 1) * char_size;
733
Victor Stinner7a9105a2011-12-12 00:13:42 +0100734 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
735 {
736 PyObject_DEL(_PyUnicode_UTF8(unicode));
737 _PyUnicode_UTF8(unicode) = NULL;
738 _PyUnicode_UTF8_LENGTH(unicode) = 0;
739 }
740
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 data = (PyObject *)PyObject_REALLOC(data, new_size);
742 if (data == NULL) {
743 PyErr_NoMemory();
744 return -1;
745 }
746 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200747 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200748 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200749 _PyUnicode_WSTR_LENGTH(unicode) = length;
750 }
751 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200752 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200753 _PyUnicode_UTF8_LENGTH(unicode) = length;
754 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 _PyUnicode_LENGTH(unicode) = length;
756 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200757 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200758 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 }
Victor Stinner95663112011-10-04 01:03:50 +0200762 assert(_PyUnicode_WSTR(unicode) != NULL);
763
764 /* check for integer overflow */
765 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
766 PyErr_NoMemory();
767 return -1;
768 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100769 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200770 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100771 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200772 if (!wstr) {
773 PyErr_NoMemory();
774 return -1;
775 }
776 _PyUnicode_WSTR(unicode) = wstr;
777 _PyUnicode_WSTR(unicode)[length] = 0;
778 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200779 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000780 return 0;
781}
782
Victor Stinnerfe226c02011-10-03 03:52:20 +0200783static PyObject*
784resize_copy(PyObject *unicode, Py_ssize_t length)
785{
786 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100787 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200788 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100789
Benjamin Petersonbac79492012-01-14 13:34:47 -0500790 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100791 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792
793 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
794 if (copy == NULL)
795 return NULL;
796
797 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200798 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200800 }
801 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200802 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100803
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200804 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200805 if (w == NULL)
806 return NULL;
807 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
808 copy_length = Py_MIN(copy_length, length);
809 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
810 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200811 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200812 }
813}
814
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000816 Ux0000 terminated; some code (e.g. new_identifier)
817 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000818
819 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000820 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821
822*/
823
Alexander Belopolsky40018472011-02-26 01:02:56 +0000824static PyUnicodeObject *
825_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000826{
827 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829
Thomas Wouters477c8d52006-05-27 19:21:47 +0000830 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000831 if (length == 0 && unicode_empty != NULL) {
832 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200833 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 }
835
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000836 /* Ensure we won't overflow the size. */
837 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
838 return (PyUnicodeObject *)PyErr_NoMemory();
839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840 if (length < 0) {
841 PyErr_SetString(PyExc_SystemError,
842 "Negative size passed to _PyUnicode_New");
843 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000844 }
845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
847 if (unicode == NULL)
848 return NULL;
849 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
850 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
851 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100852 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000853 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100854 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200856
Jeremy Hyltond8082792003-09-16 19:41:39 +0000857 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000858 * the caller fails before initializing str -- unicode_resize()
859 * reads str[0], and the Keep-Alive optimization can keep memory
860 * allocated for str alive across a call to unicode_dealloc(unicode).
861 * We don't want unicode_resize to read uninitialized memory in
862 * that case.
863 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200864 _PyUnicode_WSTR(unicode)[0] = 0;
865 _PyUnicode_WSTR(unicode)[length] = 0;
866 _PyUnicode_WSTR_LENGTH(unicode) = length;
867 _PyUnicode_HASH(unicode) = -1;
868 _PyUnicode_STATE(unicode).interned = 0;
869 _PyUnicode_STATE(unicode).kind = 0;
870 _PyUnicode_STATE(unicode).compact = 0;
871 _PyUnicode_STATE(unicode).ready = 0;
872 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200873 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200875 _PyUnicode_UTF8(unicode) = NULL;
876 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100877 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878 return unicode;
879}
880
Victor Stinnerf42dc442011-10-02 23:33:16 +0200881static const char*
882unicode_kind_name(PyObject *unicode)
883{
Victor Stinner42dfd712011-10-03 14:41:45 +0200884 /* don't check consistency: unicode_kind_name() is called from
885 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200886 if (!PyUnicode_IS_COMPACT(unicode))
887 {
888 if (!PyUnicode_IS_READY(unicode))
889 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600890 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200891 {
892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "legacy ascii";
895 else
896 return "legacy latin1";
897 case PyUnicode_2BYTE_KIND:
898 return "legacy UCS2";
899 case PyUnicode_4BYTE_KIND:
900 return "legacy UCS4";
901 default:
902 return "<legacy invalid kind>";
903 }
904 }
905 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600906 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200907 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200908 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200909 return "ascii";
910 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200911 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200912 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200913 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200914 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200915 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200916 default:
917 return "<invalid compact kind>";
918 }
919}
920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro, so that it
   can be called from a debugger prompt. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
926
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
930void *_PyUnicode_data(void *unicode){
931 printf("obj %p\n", unicode);
932 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
933 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
934 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
935 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
936 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
937 return PyUnicode_DATA(unicode);
938}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200939
940void
941_PyUnicode_Dump(PyObject *op)
942{
943 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200944 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
945 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
946 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200947
Victor Stinnera849a4b2011-10-03 12:12:11 +0200948 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200949 {
950 if (ascii->state.ascii)
951 data = (ascii + 1);
952 else
953 data = (compact + 1);
954 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200955 else
956 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200957 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
958
Victor Stinnera849a4b2011-10-03 12:12:11 +0200959 if (ascii->wstr == data)
960 printf("shared ");
961 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200962
Victor Stinnera3b334d2011-10-03 13:53:37 +0200963 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200964 printf(" (%zu), ", compact->wstr_length);
965 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
966 printf("shared ");
967 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200968 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200969 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200970}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971#endif
972
973PyObject *
974PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
975{
976 PyObject *obj;
977 PyCompactUnicodeObject *unicode;
978 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200979 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200980 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981 Py_ssize_t char_size;
982 Py_ssize_t struct_size;
983
984 /* Optimization for empty strings */
985 if (size == 0 && unicode_empty != NULL) {
986 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200987 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 }
989
Victor Stinner9e9d6892011-10-04 01:02:02 +0200990 is_ascii = 0;
991 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992 struct_size = sizeof(PyCompactUnicodeObject);
993 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200994 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995 char_size = 1;
996 is_ascii = 1;
997 struct_size = sizeof(PyASCIIObject);
998 }
999 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001000 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001 char_size = 1;
1002 }
1003 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001004 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001005 char_size = 2;
1006 if (sizeof(wchar_t) == 2)
1007 is_sharing = 1;
1008 }
1009 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001010 if (maxchar > MAX_UNICODE) {
1011 PyErr_SetString(PyExc_SystemError,
1012 "invalid maximum character passed to PyUnicode_New");
1013 return NULL;
1014 }
Victor Stinner8f825062012-04-27 13:55:39 +02001015 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001016 char_size = 4;
1017 if (sizeof(wchar_t) == 4)
1018 is_sharing = 1;
1019 }
1020
1021 /* Ensure we won't overflow the size. */
1022 if (size < 0) {
1023 PyErr_SetString(PyExc_SystemError,
1024 "Negative size passed to PyUnicode_New");
1025 return NULL;
1026 }
1027 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1028 return PyErr_NoMemory();
1029
1030 /* Duplicated allocation code from _PyObject_New() instead of a call to
1031 * PyObject_New() so we are able to allocate space for the object and
1032 * it's data buffer.
1033 */
1034 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1035 if (obj == NULL)
1036 return PyErr_NoMemory();
1037 obj = PyObject_INIT(obj, &PyUnicode_Type);
1038 if (obj == NULL)
1039 return NULL;
1040
1041 unicode = (PyCompactUnicodeObject *)obj;
1042 if (is_ascii)
1043 data = ((PyASCIIObject*)obj) + 1;
1044 else
1045 data = unicode + 1;
1046 _PyUnicode_LENGTH(unicode) = size;
1047 _PyUnicode_HASH(unicode) = -1;
1048 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001049 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 _PyUnicode_STATE(unicode).compact = 1;
1051 _PyUnicode_STATE(unicode).ready = 1;
1052 _PyUnicode_STATE(unicode).ascii = is_ascii;
1053 if (is_ascii) {
1054 ((char*)data)[size] = 0;
1055 _PyUnicode_WSTR(unicode) = NULL;
1056 }
Victor Stinner8f825062012-04-27 13:55:39 +02001057 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 ((char*)data)[size] = 0;
1059 _PyUnicode_WSTR(unicode) = NULL;
1060 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001062 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 else {
1065 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001066 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001067 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 ((Py_UCS4*)data)[size] = 0;
1071 if (is_sharing) {
1072 _PyUnicode_WSTR_LENGTH(unicode) = size;
1073 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1074 }
1075 else {
1076 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1077 _PyUnicode_WSTR(unicode) = NULL;
1078 }
1079 }
Victor Stinner8f825062012-04-27 13:55:39 +02001080#ifdef Py_DEBUG
1081 /* Fill the data with invalid characters to detect bugs earlier.
1082 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1083 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1084 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1085 memset(data, 0xff, size * kind);
1086#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001087 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 return obj;
1089}
1090
1091#if SIZEOF_WCHAR_T == 2
1092/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1093 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001094 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095
1096 This function assumes that unicode can hold one more code point than wstr
1097 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001098static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001099unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001100 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101{
1102 const wchar_t *iter;
1103 Py_UCS4 *ucs4_out;
1104
Victor Stinner910337b2011-10-03 03:20:16 +02001105 assert(unicode != NULL);
1106 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1108 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1109
1110 for (iter = begin; iter < end; ) {
1111 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1112 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001113 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1114 && (iter+1) < end
1115 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116 {
Victor Stinner551ac952011-11-29 22:58:13 +01001117 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 iter += 2;
1119 }
1120 else {
1121 *ucs4_out++ = *iter;
1122 iter++;
1123 }
1124 }
1125 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1126 _PyUnicode_GET_LENGTH(unicode)));
1127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128}
1129#endif
1130
Victor Stinnercd9950f2011-10-02 00:34:53 +02001131static int
Victor Stinner488fa492011-12-12 00:01:39 +01001132unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133{
Victor Stinner488fa492011-12-12 00:01:39 +01001134 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001135 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001136 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001137 return -1;
1138 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return 0;
1140}
1141
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001142static int
1143_copy_characters(PyObject *to, Py_ssize_t to_start,
1144 PyObject *from, Py_ssize_t from_start,
1145 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001147 unsigned int from_kind, to_kind;
1148 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149
Victor Stinneree4544c2012-05-09 22:24:08 +02001150 assert(0 <= how_many);
1151 assert(0 <= from_start);
1152 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001153 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001154 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001155 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156
Victor Stinnerd3f08822012-05-29 12:57:52 +02001157 assert(PyUnicode_Check(to));
1158 assert(PyUnicode_IS_READY(to));
1159 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1160
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001161 if (how_many == 0)
1162 return 0;
1163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001164 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001165 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001167 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168
Victor Stinnerf1852262012-06-16 16:38:26 +02001169#ifdef Py_DEBUG
1170 if (!check_maxchar
1171 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1172 {
1173 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1174 Py_UCS4 ch;
1175 Py_ssize_t i;
1176 for (i=0; i < how_many; i++) {
1177 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1178 assert(ch <= to_maxchar);
1179 }
1180 }
1181#endif
1182
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001183 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001184 if (check_maxchar
1185 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1186 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001187 /* Writing Latin-1 characters into an ASCII string requires to
1188 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001189 Py_UCS4 max_char;
1190 max_char = ucs1lib_find_max_char(from_data,
1191 (Py_UCS1*)from_data + how_many);
1192 if (max_char >= 128)
1193 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001194 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001195 Py_MEMCPY((char*)to_data + to_kind * to_start,
1196 (char*)from_data + from_kind * from_start,
1197 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001198 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001199 else if (from_kind == PyUnicode_1BYTE_KIND
1200 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001201 {
1202 _PyUnicode_CONVERT_BYTES(
1203 Py_UCS1, Py_UCS2,
1204 PyUnicode_1BYTE_DATA(from) + from_start,
1205 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1206 PyUnicode_2BYTE_DATA(to) + to_start
1207 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001208 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001209 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001210 && to_kind == PyUnicode_4BYTE_KIND)
1211 {
1212 _PyUnicode_CONVERT_BYTES(
1213 Py_UCS1, Py_UCS4,
1214 PyUnicode_1BYTE_DATA(from) + from_start,
1215 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1216 PyUnicode_4BYTE_DATA(to) + to_start
1217 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001218 }
1219 else if (from_kind == PyUnicode_2BYTE_KIND
1220 && to_kind == PyUnicode_4BYTE_KIND)
1221 {
1222 _PyUnicode_CONVERT_BYTES(
1223 Py_UCS2, Py_UCS4,
1224 PyUnicode_2BYTE_DATA(from) + from_start,
1225 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1226 PyUnicode_4BYTE_DATA(to) + to_start
1227 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001228 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001229 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001230 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1231
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001232 if (!check_maxchar) {
1233 if (from_kind == PyUnicode_2BYTE_KIND
1234 && to_kind == PyUnicode_1BYTE_KIND)
1235 {
1236 _PyUnicode_CONVERT_BYTES(
1237 Py_UCS2, Py_UCS1,
1238 PyUnicode_2BYTE_DATA(from) + from_start,
1239 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1240 PyUnicode_1BYTE_DATA(to) + to_start
1241 );
1242 }
1243 else if (from_kind == PyUnicode_4BYTE_KIND
1244 && to_kind == PyUnicode_1BYTE_KIND)
1245 {
1246 _PyUnicode_CONVERT_BYTES(
1247 Py_UCS4, Py_UCS1,
1248 PyUnicode_4BYTE_DATA(from) + from_start,
1249 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1250 PyUnicode_1BYTE_DATA(to) + to_start
1251 );
1252 }
1253 else if (from_kind == PyUnicode_4BYTE_KIND
1254 && to_kind == PyUnicode_2BYTE_KIND)
1255 {
1256 _PyUnicode_CONVERT_BYTES(
1257 Py_UCS4, Py_UCS2,
1258 PyUnicode_4BYTE_DATA(from) + from_start,
1259 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1260 PyUnicode_2BYTE_DATA(to) + to_start
1261 );
1262 }
1263 else {
1264 assert(0);
1265 return -1;
1266 }
1267 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001268 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001269 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001270 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001271 Py_ssize_t i;
1272
Victor Stinnera0702ab2011-09-29 14:14:38 +02001273 for (i=0; i < how_many; i++) {
1274 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001275 if (ch > to_maxchar)
1276 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001277 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1278 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001279 }
1280 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001281 return 0;
1282}
1283
Victor Stinnerd3f08822012-05-29 12:57:52 +02001284void
1285_PyUnicode_FastCopyCharacters(
1286 PyObject *to, Py_ssize_t to_start,
1287 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001288{
1289 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1290}
1291
1292Py_ssize_t
1293PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1294 PyObject *from, Py_ssize_t from_start,
1295 Py_ssize_t how_many)
1296{
1297 int err;
1298
1299 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1300 PyErr_BadInternalCall();
1301 return -1;
1302 }
1303
Benjamin Petersonbac79492012-01-14 13:34:47 -05001304 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001305 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001306 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001307 return -1;
1308
Victor Stinnerd3f08822012-05-29 12:57:52 +02001309 if (from_start < 0) {
1310 PyErr_SetString(PyExc_IndexError, "string index out of range");
1311 return -1;
1312 }
1313 if (to_start < 0) {
1314 PyErr_SetString(PyExc_IndexError, "string index out of range");
1315 return -1;
1316 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001317 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1318 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1319 PyErr_Format(PyExc_SystemError,
1320 "Cannot write %zi characters at %zi "
1321 "in a string of %zi characters",
1322 how_many, to_start, PyUnicode_GET_LENGTH(to));
1323 return -1;
1324 }
1325
1326 if (how_many == 0)
1327 return 0;
1328
Victor Stinner488fa492011-12-12 00:01:39 +01001329 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001330 return -1;
1331
1332 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1333 if (err) {
1334 PyErr_Format(PyExc_SystemError,
1335 "Cannot copy %s characters "
1336 "into a string of %s characters",
1337 unicode_kind_name(from),
1338 unicode_kind_name(to));
1339 return -1;
1340 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001341 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342}
1343
Victor Stinner17222162011-09-28 22:15:37 +02001344/* Find the maximum code point and count the number of surrogate pairs so a
1345 correct string length can be computed before converting a string to UCS4.
1346 This function counts single surrogates as a character and not as a pair.
1347
1348 Return 0 on success, or -1 on error. */
1349static int
1350find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1351 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352{
1353 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001354 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001355
Victor Stinnerc53be962011-10-02 21:33:54 +02001356 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 *num_surrogates = 0;
1358 *maxchar = 0;
1359
1360 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001362 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1363 && (iter+1) < end
1364 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001366 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 iter += 2;
1369 }
1370 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001372 {
1373 ch = *iter;
1374 iter++;
1375 }
1376 if (ch > *maxchar) {
1377 *maxchar = ch;
1378 if (*maxchar > MAX_UNICODE) {
1379 PyErr_Format(PyExc_ValueError,
1380 "character U+%x is not in range [U+0000; U+10ffff]",
1381 ch);
1382 return -1;
1383 }
1384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 }
1386 return 0;
1387}
1388
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001389int
1390_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391{
1392 wchar_t *end;
1393 Py_UCS4 maxchar = 0;
1394 Py_ssize_t num_surrogates;
1395#if SIZEOF_WCHAR_T == 2
1396 Py_ssize_t length_wo_surrogates;
1397#endif
1398
Georg Brandl7597add2011-10-05 16:36:47 +02001399 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001400 strings were created using _PyObject_New() and where no canonical
1401 representation (the str field) has been set yet aka strings
1402 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001403 assert(_PyUnicode_CHECK(unicode));
1404 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001406 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001407 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001408 /* Actually, it should neither be interned nor be anything else: */
1409 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001412 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001413 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415
1416 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001417 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1418 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 PyErr_NoMemory();
1420 return -1;
1421 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001422 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001423 _PyUnicode_WSTR(unicode), end,
1424 PyUnicode_1BYTE_DATA(unicode));
1425 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1426 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1427 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1428 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001429 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001430 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001431 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001432 }
1433 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001434 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001435 _PyUnicode_UTF8(unicode) = NULL;
1436 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 }
1438 PyObject_FREE(_PyUnicode_WSTR(unicode));
1439 _PyUnicode_WSTR(unicode) = NULL;
1440 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1441 }
1442 /* In this case we might have to convert down from 4-byte native
1443 wchar_t to 2-byte unicode. */
1444 else if (maxchar < 65536) {
1445 assert(num_surrogates == 0 &&
1446 "FindMaxCharAndNumSurrogatePairs() messed up");
1447
Victor Stinner506f5922011-09-28 22:34:18 +02001448#if SIZEOF_WCHAR_T == 2
1449 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001450 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001451 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1452 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1453 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001454 _PyUnicode_UTF8(unicode) = NULL;
1455 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001456#else
1457 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001458 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001459 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001460 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001461 PyErr_NoMemory();
1462 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 }
Victor Stinner506f5922011-09-28 22:34:18 +02001464 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1465 _PyUnicode_WSTR(unicode), end,
1466 PyUnicode_2BYTE_DATA(unicode));
1467 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1468 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1469 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001470 _PyUnicode_UTF8(unicode) = NULL;
1471 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001472 PyObject_FREE(_PyUnicode_WSTR(unicode));
1473 _PyUnicode_WSTR(unicode) = NULL;
1474 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1475#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 }
1477 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1478 else {
1479#if SIZEOF_WCHAR_T == 2
1480 /* in case the native representation is 2-bytes, we need to allocate a
1481 new normalized 4-byte version. */
1482 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1484 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 PyErr_NoMemory();
1486 return -1;
1487 }
1488 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1489 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001490 _PyUnicode_UTF8(unicode) = NULL;
1491 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001492 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1493 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001494 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 PyObject_FREE(_PyUnicode_WSTR(unicode));
1496 _PyUnicode_WSTR(unicode) = NULL;
1497 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1498#else
1499 assert(num_surrogates == 0);
1500
Victor Stinnerc3c74152011-10-02 20:39:55 +02001501 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001502 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001503 _PyUnicode_UTF8(unicode) = NULL;
1504 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1506#endif
1507 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1508 }
1509 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001510 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 return 0;
1512}
1513
Alexander Belopolsky40018472011-02-26 01:02:56 +00001514static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001515unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516{
Walter Dörwald16807132007-05-25 13:52:07 +00001517 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 case SSTATE_NOT_INTERNED:
1519 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001520
Benjamin Peterson29060642009-01-31 22:14:21 +00001521 case SSTATE_INTERNED_MORTAL:
1522 /* revive dead object temporarily for DelItem */
1523 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001524 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001525 Py_FatalError(
1526 "deletion of interned string failed");
1527 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001528
Benjamin Peterson29060642009-01-31 22:14:21 +00001529 case SSTATE_INTERNED_IMMORTAL:
1530 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001531
Benjamin Peterson29060642009-01-31 22:14:21 +00001532 default:
1533 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001534 }
1535
Victor Stinner03490912011-10-03 23:45:12 +02001536 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001538 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001539 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001540 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1541 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001543 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544}
1545
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    const PyASCIIObject *header = (const PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of length one can be a cached character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1562
Alexander Belopolsky40018472011-02-26 01:02:56 +00001563static int
Victor Stinner488fa492011-12-12 00:01:39 +01001564unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001565{
Victor Stinner488fa492011-12-12 00:01:39 +01001566 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001567 if (Py_REFCNT(unicode) != 1)
1568 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001569 if (_PyUnicode_HASH(unicode) != -1)
1570 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001571 if (PyUnicode_CHECK_INTERNED(unicode))
1572 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001573 if (!PyUnicode_CheckExact(unicode))
1574 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001575#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001576 /* singleton refcount is greater than 1 */
1577 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001578#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001579 return 1;
1580}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001581
Victor Stinnerfe226c02011-10-03 03:52:20 +02001582static int
1583unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1584{
1585 PyObject *unicode;
1586 Py_ssize_t old_length;
1587
1588 assert(p_unicode != NULL);
1589 unicode = *p_unicode;
1590
1591 assert(unicode != NULL);
1592 assert(PyUnicode_Check(unicode));
1593 assert(0 <= length);
1594
Victor Stinner910337b2011-10-03 03:20:16 +02001595 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001596 old_length = PyUnicode_WSTR_LENGTH(unicode);
1597 else
1598 old_length = PyUnicode_GET_LENGTH(unicode);
1599 if (old_length == length)
1600 return 0;
1601
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001602 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001603 _Py_INCREF_UNICODE_EMPTY();
1604 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001605 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001606 Py_DECREF(*p_unicode);
1607 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608 return 0;
1609 }
1610
Victor Stinner488fa492011-12-12 00:01:39 +01001611 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001612 PyObject *copy = resize_copy(unicode, length);
1613 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001614 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 Py_DECREF(*p_unicode);
1616 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618 }
1619
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001621 PyObject *new_unicode = resize_compact(unicode, length);
1622 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001624 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001625 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001626 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001627 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001628}
1629
Alexander Belopolsky40018472011-02-26 01:02:56 +00001630int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001632{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 PyObject *unicode;
1634 if (p_unicode == NULL) {
1635 PyErr_BadInternalCall();
1636 return -1;
1637 }
1638 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001639 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001640 {
1641 PyErr_BadInternalCall();
1642 return -1;
1643 }
1644 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001645}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001646
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001648unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1649 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001650{
1651 PyObject *result;
1652 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001653 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001654 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1655 return 0;
1656 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1657 maxchar);
1658 if (result == NULL)
1659 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001660 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 Py_DECREF(*p_unicode);
1662 *p_unicode = result;
1663 return 0;
1664}
1665
1666static int
1667unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1668 Py_UCS4 ch)
1669{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001670 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001671 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001672 return -1;
1673 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1674 PyUnicode_DATA(*p_unicode),
1675 (*pos)++, ch);
1676 return 0;
1677}
1678
Victor Stinnerc5166102012-02-22 13:55:02 +01001679/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001680
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001681 WARNING: The function doesn't copy the terminating null character and
1682 doesn't check the maximum character (may write a latin1 character in an
1683 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001684static void
1685unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1686 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001687{
1688 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1689 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001690 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001691
1692 switch (kind) {
1693 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001694 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001695 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001696 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001697 }
1698 case PyUnicode_2BYTE_KIND: {
1699 Py_UCS2 *start = (Py_UCS2 *)data + index;
1700 Py_UCS2 *ucs2 = start;
1701 assert(index <= PyUnicode_GET_LENGTH(unicode));
1702
Victor Stinner184252a2012-06-16 02:57:41 +02001703 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001704 *ucs2 = (Py_UCS2)*str;
1705
1706 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001707 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001708 }
1709 default: {
1710 Py_UCS4 *start = (Py_UCS4 *)data + index;
1711 Py_UCS4 *ucs4 = start;
1712 assert(kind == PyUnicode_4BYTE_KIND);
1713 assert(index <= PyUnicode_GET_LENGTH(unicode));
1714
Victor Stinner184252a2012-06-16 02:57:41 +02001715 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001716 *ucs4 = (Py_UCS4)*str;
1717
1718 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001719 }
1720 }
1721}
1722
1723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724static PyObject*
1725get_latin1_char(unsigned char ch)
1726{
Victor Stinnera464fc12011-10-02 20:39:30 +02001727 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001728 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001729 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730 if (!unicode)
1731 return NULL;
1732 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001733 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 unicode_latin1[ch] = unicode;
1735 }
1736 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001737 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738}
1739
Alexander Belopolsky40018472011-02-26 01:02:56 +00001740PyObject *
1741PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001743 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 Py_UCS4 maxchar = 0;
1745 Py_ssize_t num_surrogates;
1746
1747 if (u == NULL)
1748 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001750 /* If the Unicode data is known at construction time, we can apply
1751 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001754 if (size == 0)
1755 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 /* Single character Unicode objects in the Latin-1 range are
1758 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001759 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 return get_latin1_char((unsigned char)*u);
1761
1762 /* If not empty and not single character, copy the Unicode data
1763 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001764 if (find_maxchar_surrogates(u, u + size,
1765 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 return NULL;
1767
Victor Stinner8faf8212011-12-08 22:14:11 +01001768 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769 if (!unicode)
1770 return NULL;
1771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 switch (PyUnicode_KIND(unicode)) {
1773 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001774 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1776 break;
1777 case PyUnicode_2BYTE_KIND:
1778#if Py_UNICODE_SIZE == 2
1779 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1780#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001781 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1783#endif
1784 break;
1785 case PyUnicode_4BYTE_KIND:
1786#if SIZEOF_WCHAR_T == 2
1787 /* This is the only case which has to process surrogates, thus
1788 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001789 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790#else
1791 assert(num_surrogates == 0);
1792 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1793#endif
1794 break;
1795 default:
1796 assert(0 && "Impossible state");
1797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001799 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800}
1801
Alexander Belopolsky40018472011-02-26 01:02:56 +00001802PyObject *
1803PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001804{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001805 if (size < 0) {
1806 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001807 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001808 return NULL;
1809 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001810 if (u != NULL)
1811 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1812 else
1813 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001814}
1815
Alexander Belopolsky40018472011-02-26 01:02:56 +00001816PyObject *
1817PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001818{
1819 size_t size = strlen(u);
1820 if (size > PY_SSIZE_T_MAX) {
1821 PyErr_SetString(PyExc_OverflowError, "input too long");
1822 return NULL;
1823 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001824 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001825}
1826
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001827PyObject *
1828_PyUnicode_FromId(_Py_Identifier *id)
1829{
1830 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001831 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1832 strlen(id->string),
1833 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001834 if (!id->object)
1835 return NULL;
1836 PyUnicode_InternInPlace(&id->object);
1837 assert(!id->next);
1838 id->next = static_strings;
1839 static_strings = id;
1840 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001841 return id->object;
1842}
1843
1844void
1845_PyUnicode_ClearStaticStrings()
1846{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001847 _Py_Identifier *tmp, *s = static_strings;
1848 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001849 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001850 tmp = s->next;
1851 s->next = NULL;
1852 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001853 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001854 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001855}
1856
Benjamin Peterson0df54292012-03-26 14:50:32 -04001857/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001858
Victor Stinnerd3f08822012-05-29 12:57:52 +02001859PyObject*
1860_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001861{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001862 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001863 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001864 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001865#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001866 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001867#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001868 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001869 }
Victor Stinner785938e2011-12-11 20:09:03 +01001870 unicode = PyUnicode_New(size, 127);
1871 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001872 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001873 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1874 assert(_PyUnicode_CheckConsistency(unicode, 1));
1875 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001876}
1877
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001878static Py_UCS4
1879kind_maxchar_limit(unsigned int kind)
1880{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001881 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001882 case PyUnicode_1BYTE_KIND:
1883 return 0x80;
1884 case PyUnicode_2BYTE_KIND:
1885 return 0x100;
1886 case PyUnicode_4BYTE_KIND:
1887 return 0x10000;
1888 default:
1889 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001890 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001891 }
1892}
1893
Victor Stinnere6abb482012-05-02 01:15:40 +02001894Py_LOCAL_INLINE(Py_UCS4)
1895align_maxchar(Py_UCS4 maxchar)
1896{
1897 if (maxchar <= 127)
1898 return 127;
1899 else if (maxchar <= 255)
1900 return 255;
1901 else if (maxchar <= 65535)
1902 return 65535;
1903 else
1904 return MAX_UNICODE;
1905}
1906
Victor Stinner702c7342011-10-05 13:50:52 +02001907static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001908_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001909{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001910 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001911 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001912
Serhiy Storchaka678db842013-01-26 12:16:36 +02001913 if (size == 0)
1914 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001915 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001916 if (size == 1)
1917 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001918
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001919 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001920 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 if (!res)
1922 return NULL;
1923 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001924 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001925 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001926}
1927
Victor Stinnere57b1c02011-09-28 22:20:48 +02001928static PyObject*
1929_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930{
1931 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001932 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001933
Serhiy Storchaka678db842013-01-26 12:16:36 +02001934 if (size == 0)
1935 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001936 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001937 if (size == 1) {
1938 Py_UCS4 ch = u[0];
1939 if (ch < 256)
1940 return get_latin1_char((unsigned char)ch);
1941
1942 res = PyUnicode_New(1, ch);
1943 if (res == NULL)
1944 return NULL;
1945 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1946 assert(_PyUnicode_CheckConsistency(res, 1));
1947 return res;
1948 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001949
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001950 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001951 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001952 if (!res)
1953 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001954 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001956 else {
1957 _PyUnicode_CONVERT_BYTES(
1958 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1959 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001960 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 return res;
1962}
1963
Victor Stinnere57b1c02011-09-28 22:20:48 +02001964static PyObject*
1965_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966{
1967 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001968 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001969
Serhiy Storchaka678db842013-01-26 12:16:36 +02001970 if (size == 0)
1971 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001972 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001973 if (size == 1) {
1974 Py_UCS4 ch = u[0];
1975 if (ch < 256)
1976 return get_latin1_char((unsigned char)ch);
1977
1978 res = PyUnicode_New(1, ch);
1979 if (res == NULL)
1980 return NULL;
1981 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1982 assert(_PyUnicode_CheckConsistency(res, 1));
1983 return res;
1984 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001985
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001986 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001987 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 if (!res)
1989 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001990 if (max_char < 256)
1991 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1992 PyUnicode_1BYTE_DATA(res));
1993 else if (max_char < 0x10000)
1994 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1995 PyUnicode_2BYTE_DATA(res));
1996 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001998 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 return res;
2000}
2001
2002PyObject*
2003PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2004{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002005 if (size < 0) {
2006 PyErr_SetString(PyExc_ValueError, "size must be positive");
2007 return NULL;
2008 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002009 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002011 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002013 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002015 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002016 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002017 PyErr_SetString(PyExc_SystemError, "invalid kind");
2018 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020}
2021
Victor Stinnerece58de2012-04-23 23:36:38 +02002022Py_UCS4
2023_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2024{
2025 enum PyUnicode_Kind kind;
2026 void *startptr, *endptr;
2027
2028 assert(PyUnicode_IS_READY(unicode));
2029 assert(0 <= start);
2030 assert(end <= PyUnicode_GET_LENGTH(unicode));
2031 assert(start <= end);
2032
2033 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2034 return PyUnicode_MAX_CHAR_VALUE(unicode);
2035
2036 if (start == end)
2037 return 127;
2038
Victor Stinner94d558b2012-04-27 22:26:58 +02002039 if (PyUnicode_IS_ASCII(unicode))
2040 return 127;
2041
Victor Stinnerece58de2012-04-23 23:36:38 +02002042 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002043 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002044 endptr = (char *)startptr + end * kind;
2045 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002046 switch(kind) {
2047 case PyUnicode_1BYTE_KIND:
2048 return ucs1lib_find_max_char(startptr, endptr);
2049 case PyUnicode_2BYTE_KIND:
2050 return ucs2lib_find_max_char(startptr, endptr);
2051 case PyUnicode_4BYTE_KIND:
2052 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002053 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002054 assert(0);
2055 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002056 }
2057}
2058
Victor Stinner25a4b292011-10-06 12:31:55 +02002059/* Ensure that a string uses the most efficient storage, if it is not the
2060 case: create a new string with of the right kind. Write NULL into *p_unicode
2061 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002062static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002063unicode_adjust_maxchar(PyObject **p_unicode)
2064{
2065 PyObject *unicode, *copy;
2066 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002067 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002068 unsigned int kind;
2069
2070 assert(p_unicode != NULL);
2071 unicode = *p_unicode;
2072 assert(PyUnicode_IS_READY(unicode));
2073 if (PyUnicode_IS_ASCII(unicode))
2074 return;
2075
2076 len = PyUnicode_GET_LENGTH(unicode);
2077 kind = PyUnicode_KIND(unicode);
2078 if (kind == PyUnicode_1BYTE_KIND) {
2079 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002080 max_char = ucs1lib_find_max_char(u, u + len);
2081 if (max_char >= 128)
2082 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002083 }
2084 else if (kind == PyUnicode_2BYTE_KIND) {
2085 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002086 max_char = ucs2lib_find_max_char(u, u + len);
2087 if (max_char >= 256)
2088 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002089 }
2090 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002091 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002092 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002093 max_char = ucs4lib_find_max_char(u, u + len);
2094 if (max_char >= 0x10000)
2095 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002096 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002097 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002098 if (copy != NULL)
2099 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002100 Py_DECREF(unicode);
2101 *p_unicode = copy;
2102}
2103
Victor Stinner034f6cf2011-09-30 02:26:44 +02002104PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002105_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002106{
Victor Stinner87af4f22011-11-21 23:03:47 +01002107 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002108 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002109
Victor Stinner034f6cf2011-09-30 02:26:44 +02002110 if (!PyUnicode_Check(unicode)) {
2111 PyErr_BadInternalCall();
2112 return NULL;
2113 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002114 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002115 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002116
Victor Stinner87af4f22011-11-21 23:03:47 +01002117 length = PyUnicode_GET_LENGTH(unicode);
2118 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002119 if (!copy)
2120 return NULL;
2121 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2122
Victor Stinner87af4f22011-11-21 23:03:47 +01002123 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2124 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002125 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002126 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002127}
2128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129
Victor Stinnerbc603d12011-10-02 01:00:40 +02002130/* Widen Unicode objects to larger buffers. Don't write terminating null
2131 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002132
2133void*
2134_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2135{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002136 Py_ssize_t len;
2137 void *result;
2138 unsigned int skind;
2139
Benjamin Petersonbac79492012-01-14 13:34:47 -05002140 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002141 return NULL;
2142
2143 len = PyUnicode_GET_LENGTH(s);
2144 skind = PyUnicode_KIND(s);
2145 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002146 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002147 return NULL;
2148 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002149 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002150 case PyUnicode_2BYTE_KIND:
2151 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2152 if (!result)
2153 return PyErr_NoMemory();
2154 assert(skind == PyUnicode_1BYTE_KIND);
2155 _PyUnicode_CONVERT_BYTES(
2156 Py_UCS1, Py_UCS2,
2157 PyUnicode_1BYTE_DATA(s),
2158 PyUnicode_1BYTE_DATA(s) + len,
2159 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002160 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002161 case PyUnicode_4BYTE_KIND:
2162 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2163 if (!result)
2164 return PyErr_NoMemory();
2165 if (skind == PyUnicode_2BYTE_KIND) {
2166 _PyUnicode_CONVERT_BYTES(
2167 Py_UCS2, Py_UCS4,
2168 PyUnicode_2BYTE_DATA(s),
2169 PyUnicode_2BYTE_DATA(s) + len,
2170 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002172 else {
2173 assert(skind == PyUnicode_1BYTE_KIND);
2174 _PyUnicode_CONVERT_BYTES(
2175 Py_UCS1, Py_UCS4,
2176 PyUnicode_1BYTE_DATA(s),
2177 PyUnicode_1BYTE_DATA(s) + len,
2178 result);
2179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002181 default:
2182 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 }
Victor Stinner01698042011-10-04 00:04:26 +02002184 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 return NULL;
2186}
2187
2188static Py_UCS4*
2189as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2190 int copy_null)
2191{
2192 int kind;
2193 void *data;
2194 Py_ssize_t len, targetlen;
2195 if (PyUnicode_READY(string) == -1)
2196 return NULL;
2197 kind = PyUnicode_KIND(string);
2198 data = PyUnicode_DATA(string);
2199 len = PyUnicode_GET_LENGTH(string);
2200 targetlen = len;
2201 if (copy_null)
2202 targetlen++;
2203 if (!target) {
2204 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2205 PyErr_NoMemory();
2206 return NULL;
2207 }
2208 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2209 if (!target) {
2210 PyErr_NoMemory();
2211 return NULL;
2212 }
2213 }
2214 else {
2215 if (targetsize < targetlen) {
2216 PyErr_Format(PyExc_SystemError,
2217 "string is longer than the buffer");
2218 if (copy_null && 0 < targetsize)
2219 target[0] = 0;
2220 return NULL;
2221 }
2222 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002223 if (kind == PyUnicode_1BYTE_KIND) {
2224 Py_UCS1 *start = (Py_UCS1 *) data;
2225 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002227 else if (kind == PyUnicode_2BYTE_KIND) {
2228 Py_UCS2 *start = (Py_UCS2 *) data;
2229 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2230 }
2231 else {
2232 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 if (copy_null)
2236 target[len] = 0;
2237 return target;
2238}
2239
2240Py_UCS4*
2241PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2242 int copy_null)
2243{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002244 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 PyErr_BadInternalCall();
2246 return NULL;
2247 }
2248 return as_ucs4(string, target, targetsize, copy_null);
2249}
2250
2251Py_UCS4*
2252PyUnicode_AsUCS4Copy(PyObject *string)
2253{
2254 return as_ucs4(string, NULL, 0, 1);
2255}
2256
2257#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002258
Alexander Belopolsky40018472011-02-26 01:02:56 +00002259PyObject *
2260PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002264 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002265 PyErr_BadInternalCall();
2266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 }
2268
Martin v. Löwis790465f2008-04-05 20:41:37 +00002269 if (size == -1) {
2270 size = wcslen(w);
2271 }
2272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274}
2275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002277
Walter Dörwald346737f2007-05-31 10:44:43 +00002278static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002279makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2280 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002281{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 *fmt++ = '%';
2283 if (width) {
2284 if (zeropad)
2285 *fmt++ = '0';
2286 fmt += sprintf(fmt, "%d", width);
2287 }
2288 if (precision)
2289 fmt += sprintf(fmt, ".%d", precision);
2290 if (longflag)
2291 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002292 else if (longlongflag) {
2293 /* longlongflag should only ever be nonzero on machines with
2294 HAVE_LONG_LONG defined */
2295#ifdef HAVE_LONG_LONG
2296 char *f = PY_FORMAT_LONG_LONG;
2297 while (*f)
2298 *fmt++ = *f++;
2299#else
2300 /* we shouldn't ever get here */
2301 assert(0);
2302 *fmt++ = 'l';
2303#endif
2304 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002305 else if (size_tflag) {
2306 char *f = PY_FORMAT_SIZE_T;
2307 while (*f)
2308 *fmt++ = *f++;
2309 }
2310 *fmt++ = c;
2311 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002312}
2313
Victor Stinner96865452011-03-01 23:44:09 +00002314/* helper for PyUnicode_FromFormatV() */
2315
2316static const char*
2317parse_format_flags(const char *f,
2318 int *p_width, int *p_precision,
2319 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2320{
2321 int width, precision, longflag, longlongflag, size_tflag;
2322
2323 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2324 f++;
2325 width = 0;
2326 while (Py_ISDIGIT((unsigned)*f))
2327 width = (width*10) + *f++ - '0';
2328 precision = 0;
2329 if (*f == '.') {
2330 f++;
2331 while (Py_ISDIGIT((unsigned)*f))
2332 precision = (precision*10) + *f++ - '0';
2333 if (*f == '%') {
2334 /* "%.3%s" => f points to "3" */
2335 f--;
2336 }
2337 }
Serhiy Storchaka4dbc3052015-01-27 22:18:46 +02002338 if (width < precision)
2339 width = precision;
Victor Stinner96865452011-03-01 23:44:09 +00002340 if (*f == '\0') {
2341 /* bogus format "%.1" => go backward, f points to "1" */
2342 f--;
2343 }
2344 if (p_width != NULL)
2345 *p_width = width;
2346 if (p_precision != NULL)
2347 *p_precision = precision;
2348
2349 /* Handle %ld, %lu, %lld and %llu. */
2350 longflag = 0;
2351 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002352 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002353
2354 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002355 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002356 longflag = 1;
2357 ++f;
2358 }
2359#ifdef HAVE_LONG_LONG
2360 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002361 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002362 longlongflag = 1;
2363 f += 2;
2364 }
2365#endif
2366 }
2367 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002368 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002369 size_tflag = 1;
2370 ++f;
2371 }
2372 if (p_longflag != NULL)
2373 *p_longflag = longflag;
2374 if (p_longlongflag != NULL)
2375 *p_longlongflag = longlongflag;
2376 if (p_size_tflag != NULL)
2377 *p_size_tflag = size_tflag;
2378 return f;
2379}
2380
/* Maximum number of characters required for the output of %ld.
   21 characters allows for 64-bit integers (in decimal) and an optional
   sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, plus 1 for the sign.  53/22 is used as an upper bound for
   log10(256), and (x*53 - 1)/22 + 1 is the integer form of ceil(x*53/22). */
#define MAX_LONG_LONG_CHARS ((SIZEOF_LONG_LONG * 53 - 1) / 22 + 2)
2388
Walter Dörwaldd2034312007-05-18 16:29:38 +00002389PyObject *
2390PyUnicode_FromFormatV(const char *format, va_list vargs)
2391{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002392 va_list count;
2393 Py_ssize_t callcount = 0;
2394 PyObject **callresults = NULL;
2395 PyObject **callresult = NULL;
2396 Py_ssize_t n = 0;
2397 int width = 0;
2398 int precision = 0;
2399 int zeropad;
2400 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002401 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002402 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002403 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2405 Py_UCS4 argmaxchar;
2406 Py_ssize_t numbersize = 0;
2407 char *numberresults = NULL;
2408 char *numberresult = NULL;
2409 Py_ssize_t i;
2410 int kind;
2411 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002412
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002413 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002414 /* step 1: count the number of %S/%R/%A/%s format specifications
2415 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2416 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002418 * also estimate a upper bound for all the number formats in the string,
2419 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002420 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002421 for (f = format; *f; f++) {
2422 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002423 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2425 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2426 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2427 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002430#ifdef HAVE_LONG_LONG
2431 if (longlongflag) {
2432 if (width < MAX_LONG_LONG_CHARS)
2433 width = MAX_LONG_LONG_CHARS;
2434 }
2435 else
2436#endif
2437 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2438 including sign. Decimal takes the most space. This
2439 isn't enough for octal. If a width is specified we
2440 need more (which we allocate later). */
2441 if (width < MAX_LONG_CHARS)
2442 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443
2444 /* account for the size + '\0' to separate numbers
2445 inside of the numberresults buffer */
2446 numbersize += (width + 1);
2447 }
2448 }
2449 else if ((unsigned char)*f > 127) {
2450 PyErr_Format(PyExc_ValueError,
2451 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2452 "string, got a non-ASCII byte: 0x%02x",
2453 (unsigned char)*f);
2454 return NULL;
2455 }
2456 }
2457 /* step 2: allocate memory for the results of
2458 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2459 if (callcount) {
2460 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2461 if (!callresults) {
2462 PyErr_NoMemory();
2463 return NULL;
2464 }
2465 callresult = callresults;
2466 }
2467 /* step 2.5: allocate memory for the results of formating numbers */
2468 if (numbersize) {
2469 numberresults = PyObject_Malloc(numbersize);
2470 if (!numberresults) {
2471 PyErr_NoMemory();
2472 goto fail;
2473 }
2474 numberresult = numberresults;
2475 }
2476
2477 /* step 3: format numbers and figure out how large a buffer we need */
2478 for (f = format; *f; f++) {
2479 if (*f == '%') {
2480 const char* p;
2481 int longflag;
2482 int longlongflag;
2483 int size_tflag;
2484 int numprinted;
2485
2486 p = f;
2487 zeropad = (f[1] == '0');
2488 f = parse_format_flags(f, &width, &precision,
2489 &longflag, &longlongflag, &size_tflag);
2490 switch (*f) {
2491 case 'c':
2492 {
Serhiy Storchaka8eeae212013-06-23 20:12:14 +03002493 int ordinal = va_arg(count, int);
2494 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2495 PyErr_SetString(PyExc_OverflowError,
2496 "%c arg not in range(0x110000)");
2497 goto fail;
2498 }
2499 maxchar = Py_MAX(maxchar, (Py_UCS4)ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500 n++;
2501 break;
2502 }
2503 case '%':
2504 n++;
2505 break;
2506 case 'i':
2507 case 'd':
2508 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2509 width, precision, *f);
2510 if (longflag)
2511 numprinted = sprintf(numberresult, fmt,
2512 va_arg(count, long));
2513#ifdef HAVE_LONG_LONG
2514 else if (longlongflag)
2515 numprinted = sprintf(numberresult, fmt,
2516 va_arg(count, PY_LONG_LONG));
2517#endif
2518 else if (size_tflag)
2519 numprinted = sprintf(numberresult, fmt,
2520 va_arg(count, Py_ssize_t));
2521 else
2522 numprinted = sprintf(numberresult, fmt,
2523 va_arg(count, int));
2524 n += numprinted;
2525 /* advance by +1 to skip over the '\0' */
2526 numberresult += (numprinted + 1);
2527 assert(*(numberresult - 1) == '\0');
2528 assert(*(numberresult - 2) != '\0');
2529 assert(numprinted >= 0);
2530 assert(numberresult <= numberresults + numbersize);
2531 break;
2532 case 'u':
2533 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2534 width, precision, 'u');
2535 if (longflag)
2536 numprinted = sprintf(numberresult, fmt,
2537 va_arg(count, unsigned long));
2538#ifdef HAVE_LONG_LONG
2539 else if (longlongflag)
2540 numprinted = sprintf(numberresult, fmt,
2541 va_arg(count, unsigned PY_LONG_LONG));
2542#endif
2543 else if (size_tflag)
2544 numprinted = sprintf(numberresult, fmt,
2545 va_arg(count, size_t));
2546 else
2547 numprinted = sprintf(numberresult, fmt,
2548 va_arg(count, unsigned int));
2549 n += numprinted;
2550 numberresult += (numprinted + 1);
2551 assert(*(numberresult - 1) == '\0');
2552 assert(*(numberresult - 2) != '\0');
2553 assert(numprinted >= 0);
2554 assert(numberresult <= numberresults + numbersize);
2555 break;
2556 case 'x':
2557 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2558 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2559 n += numprinted;
2560 numberresult += (numprinted + 1);
2561 assert(*(numberresult - 1) == '\0');
2562 assert(*(numberresult - 2) != '\0');
2563 assert(numprinted >= 0);
2564 assert(numberresult <= numberresults + numbersize);
2565 break;
2566 case 'p':
2567 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2568 /* %p is ill-defined: ensure leading 0x. */
2569 if (numberresult[1] == 'X')
2570 numberresult[1] = 'x';
2571 else if (numberresult[1] != 'x') {
2572 memmove(numberresult + 2, numberresult,
2573 strlen(numberresult) + 1);
2574 numberresult[0] = '0';
2575 numberresult[1] = 'x';
2576 numprinted += 2;
2577 }
2578 n += numprinted;
2579 numberresult += (numprinted + 1);
2580 assert(*(numberresult - 1) == '\0');
2581 assert(*(numberresult - 2) != '\0');
2582 assert(numprinted >= 0);
2583 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 break;
2585 case 's':
2586 {
2587 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002588 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002589 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002590 if (!str)
2591 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592 /* since PyUnicode_DecodeUTF8 returns already flexible
2593 unicode objects, there is no need to call ready on them */
2594 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002595 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002596 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002597 /* Remember the str and switch to the next slot */
2598 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002599 break;
2600 }
2601 case 'U':
2602 {
2603 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002604 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002605 if (PyUnicode_READY(obj) == -1)
2606 goto fail;
2607 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002608 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002610 break;
2611 }
2612 case 'V':
2613 {
2614 PyObject *obj = va_arg(count, PyObject *);
2615 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002616 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002617 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002618 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002619 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 if (PyUnicode_READY(obj) == -1)
2621 goto fail;
2622 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002623 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002624 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002625 *callresult++ = NULL;
2626 }
2627 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002628 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002629 if (!str_obj)
2630 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002631 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002632 Py_DECREF(str_obj);
2633 goto fail;
2634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002636 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002638 *callresult++ = str_obj;
2639 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002640 break;
2641 }
2642 case 'S':
2643 {
2644 PyObject *obj = va_arg(count, PyObject *);
2645 PyObject *str;
2646 assert(obj);
2647 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002648 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002649 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002650 if (PyUnicode_READY(str) == -1) {
2651 Py_DECREF(str);
2652 goto fail;
2653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002655 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002657 /* Remember the str and switch to the next slot */
2658 *callresult++ = str;
2659 break;
2660 }
2661 case 'R':
2662 {
2663 PyObject *obj = va_arg(count, PyObject *);
2664 PyObject *repr;
2665 assert(obj);
2666 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002667 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002669 if (PyUnicode_READY(repr) == -1) {
2670 Py_DECREF(repr);
2671 goto fail;
2672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002674 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002676 /* Remember the repr and switch to the next slot */
2677 *callresult++ = repr;
2678 break;
2679 }
2680 case 'A':
2681 {
2682 PyObject *obj = va_arg(count, PyObject *);
2683 PyObject *ascii;
2684 assert(obj);
2685 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002686 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002687 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002688 if (PyUnicode_READY(ascii) == -1) {
2689 Py_DECREF(ascii);
2690 goto fail;
2691 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002692 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Benjamin Peterson7e303732013-06-10 09:19:46 -07002693 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002694 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002695 /* Remember the repr and switch to the next slot */
2696 *callresult++ = ascii;
2697 break;
2698 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002699 default:
2700 /* if we stumble upon an unknown
2701 formatting code, copy the rest of
2702 the format string to the output
2703 string. (we cannot just skip the
2704 code, since there's no way to know
2705 what's in the argument list) */
2706 n += strlen(p);
2707 goto expand;
2708 }
2709 } else
2710 n++;
2711 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002712 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002713 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002715 we don't have to resize the string.
2716 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002717 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002718 if (!string)
2719 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002720 kind = PyUnicode_KIND(string);
2721 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002722 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002723 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002725 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002726 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002727 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002728
2729 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002730 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2731 /* checking for == because the last argument could be a empty
2732 string, which causes i to point to end, the assert at the end of
2733 the loop */
2734 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002735
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 switch (*f) {
2737 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002738 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 const int ordinal = va_arg(vargs, int);
2740 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002741 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002742 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002743 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002744 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002745 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002746 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002748 {
Victor Stinner184252a2012-06-16 02:57:41 +02002749 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002750 /* unused, since we already have the result */
2751 if (*f == 'p')
2752 (void) va_arg(vargs, void *);
2753 else
2754 (void) va_arg(vargs, int);
2755 /* extract the result from numberresults and append. */
Victor Stinner184252a2012-06-16 02:57:41 +02002756 len = strlen(numberresult);
2757 unicode_write_cstr(string, i, numberresult, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002758 /* skip over the separating '\0' */
Victor Stinner184252a2012-06-16 02:57:41 +02002759 i += len;
2760 numberresult += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761 assert(*numberresult == '\0');
2762 numberresult++;
2763 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002764 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002765 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002766 case 's':
2767 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002768 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002769 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002770 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002771 size = PyUnicode_GET_LENGTH(*callresult);
2772 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002773 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002774 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002775 /* We're done with the unicode()/repr() => forget it */
2776 Py_DECREF(*callresult);
2777 /* switch to next unicode()/repr() result */
2778 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002779 break;
2780 }
2781 case 'U':
2782 {
2783 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002784 Py_ssize_t size;
2785 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2786 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002787 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002789 break;
2790 }
2791 case 'V':
2792 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002794 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002795 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002796 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002797 size = PyUnicode_GET_LENGTH(obj);
2798 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002799 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002800 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002801 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 size = PyUnicode_GET_LENGTH(*callresult);
2803 assert(PyUnicode_KIND(*callresult) <=
2804 PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002805 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002806 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002807 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002808 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002809 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002810 break;
2811 }
2812 case 'S':
2813 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002814 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002815 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002816 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002817 /* unused, since we already have the result */
2818 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002820 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002821 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002822 /* We're done with the unicode()/repr() => forget it */
2823 Py_DECREF(*callresult);
2824 /* switch to next unicode()/repr() result */
2825 ++callresult;
2826 break;
2827 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002828 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002829 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002830 break;
2831 default:
Victor Stinner184252a2012-06-16 02:57:41 +02002832 {
2833 Py_ssize_t len = strlen(p);
2834 unicode_write_cstr(string, i, p, len);
2835 i += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002836 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002837 goto end;
2838 }
Victor Stinner184252a2012-06-16 02:57:41 +02002839 }
Victor Stinner1205f272010-09-11 00:54:47 +00002840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002841 else {
2842 assert(i < PyUnicode_GET_LENGTH(string));
2843 PyUnicode_WRITE(kind, data, i++, *f);
2844 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002846 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002847
Benjamin Peterson29060642009-01-31 22:14:21 +00002848 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002849 if (callresults)
2850 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002851 if (numberresults)
2852 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002853 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002854 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002855 if (callresults) {
2856 PyObject **callresult2 = callresults;
2857 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002858 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002859 ++callresult2;
2860 }
2861 PyObject_Free(callresults);
2862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002863 if (numberresults)
2864 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002865 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002866}
2867
Walter Dörwaldd2034312007-05-18 16:29:38 +00002868PyObject *
2869PyUnicode_FromFormat(const char *format, ...)
2870{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002871 PyObject* ret;
2872 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002873
2874#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002875 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002876#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002877 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002878#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002879 ret = PyUnicode_FromFormatV(format, vargs);
2880 va_end(vargs);
2881 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002882}
2883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002884#ifdef HAVE_WCHAR_H
2885
Victor Stinner5593d8a2010-10-02 11:11:27 +00002886/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2887 convert a Unicode object to a wide character string.
2888
Victor Stinnerd88d9832011-09-06 02:00:05 +02002889 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002890 character) required to convert the unicode object. Ignore size argument.
2891
Victor Stinnerd88d9832011-09-06 02:00:05 +02002892 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002893 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002894 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002895static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002896unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002897 wchar_t *w,
2898 Py_ssize_t size)
2899{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002900 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002901 const wchar_t *wstr;
2902
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002903 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002904 if (wstr == NULL)
2905 return -1;
2906
Victor Stinner5593d8a2010-10-02 11:11:27 +00002907 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002908 if (size > res)
2909 size = res + 1;
2910 else
2911 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002912 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002913 return res;
2914 }
2915 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002916 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002917}
2918
2919Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002920PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002921 wchar_t *w,
2922 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923{
2924 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002925 PyErr_BadInternalCall();
2926 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002927 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002928 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002929}
2930
Victor Stinner137c34c2010-09-29 10:25:54 +00002931wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002932PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002933 Py_ssize_t *size)
2934{
2935 wchar_t* buffer;
2936 Py_ssize_t buflen;
2937
2938 if (unicode == NULL) {
2939 PyErr_BadInternalCall();
2940 return NULL;
2941 }
2942
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002943 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002944 if (buflen == -1)
2945 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002946 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002947 PyErr_NoMemory();
2948 return NULL;
2949 }
2950
Victor Stinner137c34c2010-09-29 10:25:54 +00002951 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2952 if (buffer == NULL) {
2953 PyErr_NoMemory();
2954 return NULL;
2955 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002956 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002957 if (buflen == -1) {
2958 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002959 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002960 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002961 if (size != NULL)
2962 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002963 return buffer;
2964}
2965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002966#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967
Alexander Belopolsky40018472011-02-26 01:02:56 +00002968PyObject *
2969PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002970{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002971 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002972 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 PyErr_SetString(PyExc_ValueError,
2974 "chr() arg not in range(0x110000)");
2975 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002976 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002977
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002978 if ((Py_UCS4)ordinal < 256)
2979 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002981 v = PyUnicode_New(1, ordinal);
2982 if (v == NULL)
2983 return NULL;
2984 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002985 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002986 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002987}
2988
Alexander Belopolsky40018472011-02-26 01:02:56 +00002989PyObject *
2990PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002992 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002993 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002994 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002995 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002996 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002997 Py_INCREF(obj);
2998 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002999 }
3000 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003001 /* For a Unicode subtype that's not a Unicode object,
3002 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003003 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003004 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003005 PyErr_Format(PyExc_TypeError,
3006 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003007 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003008 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003009}
3010
Alexander Belopolsky40018472011-02-26 01:02:56 +00003011PyObject *
3012PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003013 const char *encoding,
3014 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003015{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003016 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003017 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003018
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003020 PyErr_BadInternalCall();
3021 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003023
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003024 /* Decoding bytes objects is the most common case and should be fast */
3025 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003026 if (PyBytes_GET_SIZE(obj) == 0)
3027 _Py_RETURN_UNICODE_EMPTY();
3028 v = PyUnicode_Decode(
3029 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3030 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003031 return v;
3032 }
3033
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003034 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003035 PyErr_SetString(PyExc_TypeError,
3036 "decoding str is not supported");
3037 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003038 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003039
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003040 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3041 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3042 PyErr_Format(PyExc_TypeError,
3043 "coercing to str: need bytes, bytearray "
3044 "or buffer-like object, %.80s found",
3045 Py_TYPE(obj)->tp_name);
3046 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003047 }
Tim Petersced69f82003-09-16 20:30:58 +00003048
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003049 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003050 PyBuffer_Release(&buffer);
3051 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003053
Serhiy Storchaka05997252013-01-26 12:14:02 +02003054 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003055 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003056 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057}
3058
/* Normalize an encoding name so that common spellings can be matched with
   strcmp(): upper-case letters (as classified by Py_ISUPPER) are converted
   to lower case and '_' is replaced with '-', in order to catch e.g. UTF_8.
   A NULL encoding is normalized to "utf-8".

   The result is written to lower, a buffer of lower_len bytes, and is
   null-terminated on success.

   Return 1 on success.  Return 0 on error, i.e. when the normalized name
   plus its terminating null byte does not fit into lower_len bytes; the
   content of lower is unspecified in that case. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1: the default name is subject to the
           same size limit as any other encoding name. */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminator, and
       lower_len - 1 below would wrap around. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3095
Alexander Belopolsky40018472011-02-26 01:02:56 +00003096PyObject *
3097PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003098 Py_ssize_t size,
3099 const char *encoding,
3100 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003101{
3102 PyObject *buffer = NULL, *unicode;
3103 Py_buffer info;
3104 char lower[11]; /* Enough for any encoding shortcut */
3105
Fred Drakee4315f52000-05-09 19:53:39 +00003106 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003107 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003108 if ((strcmp(lower, "utf-8") == 0) ||
3109 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003110 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003111 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003112 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003113 (strcmp(lower, "iso-8859-1") == 0))
3114 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003115#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003116 else if (strcmp(lower, "mbcs") == 0)
3117 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003118#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003119 else if (strcmp(lower, "ascii") == 0)
3120 return PyUnicode_DecodeASCII(s, size, errors);
3121 else if (strcmp(lower, "utf-16") == 0)
3122 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3123 else if (strcmp(lower, "utf-32") == 0)
3124 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3125 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126
3127 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003128 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003129 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003130 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003131 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132 if (buffer == NULL)
3133 goto onError;
Serhiy Storchaka94ee3892014-02-24 14:43:03 +02003134 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135 if (unicode == NULL)
3136 goto onError;
3137 if (!PyUnicode_Check(unicode)) {
3138 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003139 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003140 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 Py_DECREF(unicode);
3142 goto onError;
3143 }
3144 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003145 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003146
Benjamin Peterson29060642009-01-31 22:14:21 +00003147 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148 Py_XDECREF(buffer);
3149 return NULL;
3150}
3151
Alexander Belopolsky40018472011-02-26 01:02:56 +00003152PyObject *
3153PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003154 const char *encoding,
3155 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003156{
3157 PyObject *v;
3158
3159 if (!PyUnicode_Check(unicode)) {
3160 PyErr_BadArgument();
3161 goto onError;
3162 }
3163
3164 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003165 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003166
3167 /* Decode via the codec registry */
3168 v = PyCodec_Decode(unicode, encoding, errors);
3169 if (v == NULL)
3170 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003171 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003172
Benjamin Peterson29060642009-01-31 22:14:21 +00003173 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003174 return NULL;
3175}
3176
Alexander Belopolsky40018472011-02-26 01:02:56 +00003177PyObject *
3178PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003179 const char *encoding,
3180 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003181{
3182 PyObject *v;
3183
3184 if (!PyUnicode_Check(unicode)) {
3185 PyErr_BadArgument();
3186 goto onError;
3187 }
3188
3189 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003190 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003191
3192 /* Decode via the codec registry */
3193 v = PyCodec_Decode(unicode, encoding, errors);
3194 if (v == NULL)
3195 goto onError;
3196 if (!PyUnicode_Check(v)) {
3197 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003198 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003199 Py_TYPE(v)->tp_name);
3200 Py_DECREF(v);
3201 goto onError;
3202 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003203 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003204
Benjamin Peterson29060642009-01-31 22:14:21 +00003205 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003206 return NULL;
3207}
3208
Alexander Belopolsky40018472011-02-26 01:02:56 +00003209PyObject *
3210PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003211 Py_ssize_t size,
3212 const char *encoding,
3213 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214{
3215 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003216
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 unicode = PyUnicode_FromUnicode(s, size);
3218 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3221 Py_DECREF(unicode);
3222 return v;
3223}
3224
Alexander Belopolsky40018472011-02-26 01:02:56 +00003225PyObject *
3226PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003227 const char *encoding,
3228 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003229{
3230 PyObject *v;
3231
3232 if (!PyUnicode_Check(unicode)) {
3233 PyErr_BadArgument();
3234 goto onError;
3235 }
3236
3237 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003239
3240 /* Encode via the codec registry */
3241 v = PyCodec_Encode(unicode, encoding, errors);
3242 if (v == NULL)
3243 goto onError;
3244 return v;
3245
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003247 return NULL;
3248}
3249
/* Locate the first character of the null-terminated wide string wstr that
   wcstombs() cannot convert, by converting the string one character at a
   time (one surrogate pair at a time when wchar_t is 2 bytes wide).

   Return the index, in wchar_t units, of the offending character, or 0 if
   every character converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminator */
#else
    wchar_t unit[2];            /* single character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;
    const wchar_t *char_start;
    size_t converted;

    /* The last slot is the terminator and is never overwritten below. */
    unit[sizeof(unit) / sizeof(unit[0]) - 1] = 0;

    while (*cursor != L'\0') {
        char_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of a surrogate pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        converted = wcstombs(scratch, unit, sizeof(scratch));
        if (converted == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3296
Victor Stinner1b579672011-12-17 05:47:23 +01003297static int
3298locale_error_handler(const char *errors, int *surrogateescape)
3299{
3300 if (errors == NULL) {
3301 *surrogateescape = 0;
3302 return 0;
3303 }
3304
3305 if (strcmp(errors, "strict") == 0) {
3306 *surrogateescape = 0;
3307 return 0;
3308 }
3309 if (strcmp(errors, "surrogateescape") == 0) {
3310 *surrogateescape = 1;
3311 return 0;
3312 }
3313 PyErr_Format(PyExc_ValueError,
3314 "only 'strict' and 'surrogateescape' error handlers "
3315 "are supported, not '%s'",
3316 errors);
3317 return -1;
3318}
3319
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003320PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003321PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003322{
3323 Py_ssize_t wlen, wlen2;
3324 wchar_t *wstr;
3325 PyObject *bytes = NULL;
3326 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003327 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003328 PyObject *exc;
3329 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003330 int surrogateescape;
3331
3332 if (locale_error_handler(errors, &surrogateescape) < 0)
3333 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003334
3335 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3336 if (wstr == NULL)
3337 return NULL;
3338
3339 wlen2 = wcslen(wstr);
3340 if (wlen2 != wlen) {
3341 PyMem_Free(wstr);
3342 PyErr_SetString(PyExc_TypeError, "embedded null character");
3343 return NULL;
3344 }
3345
3346 if (surrogateescape) {
3347 /* locale encoding with surrogateescape */
3348 char *str;
3349
3350 str = _Py_wchar2char(wstr, &error_pos);
3351 if (str == NULL) {
3352 if (error_pos == (size_t)-1) {
3353 PyErr_NoMemory();
3354 PyMem_Free(wstr);
3355 return NULL;
3356 }
3357 else {
3358 goto encode_error;
3359 }
3360 }
3361 PyMem_Free(wstr);
3362
3363 bytes = PyBytes_FromString(str);
3364 PyMem_Free(str);
3365 }
3366 else {
3367 size_t len, len2;
3368
3369 len = wcstombs(NULL, wstr, 0);
3370 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003371 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003372 goto encode_error;
3373 }
3374
3375 bytes = PyBytes_FromStringAndSize(NULL, len);
3376 if (bytes == NULL) {
3377 PyMem_Free(wstr);
3378 return NULL;
3379 }
3380
3381 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3382 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003383 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003384 goto encode_error;
3385 }
3386 PyMem_Free(wstr);
3387 }
3388 return bytes;
3389
3390encode_error:
3391 errmsg = strerror(errno);
3392 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003393
3394 if (error_pos == (size_t)-1)
3395 error_pos = wcstombs_errorpos(wstr);
3396
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003397 PyMem_Free(wstr);
3398 Py_XDECREF(bytes);
3399
Victor Stinner2f197072011-12-17 07:08:30 +01003400 if (errmsg != NULL) {
3401 size_t errlen;
3402 wstr = _Py_char2wchar(errmsg, &errlen);
3403 if (wstr != NULL) {
3404 reason = PyUnicode_FromWideChar(wstr, errlen);
3405 PyMem_Free(wstr);
3406 } else
3407 errmsg = NULL;
3408 }
3409 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003410 reason = PyUnicode_FromString(
3411 "wcstombs() encountered an unencodable "
3412 "wide character");
3413 if (reason == NULL)
3414 return NULL;
3415
3416 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3417 "locale", unicode,
3418 (Py_ssize_t)error_pos,
3419 (Py_ssize_t)(error_pos+1),
3420 reason);
3421 Py_DECREF(reason);
3422 if (exc != NULL) {
3423 PyCodec_StrictErrors(exc);
3424 Py_XDECREF(exc);
3425 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003426 return NULL;
3427}
3428
Victor Stinnerad158722010-10-27 00:25:46 +00003429PyObject *
3430PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003431{
Victor Stinner99b95382011-07-04 14:23:54 +02003432#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003433 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003434#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003435 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003436#else
Victor Stinner793b5312011-04-27 00:24:21 +02003437 PyInterpreterState *interp = PyThreadState_GET()->interp;
3438 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3439 cannot use it to encode and decode filenames before it is loaded. Load
3440 the Python codec requires to encode at least its own filename. Use the C
3441 version of the locale codec until the codec registry is initialized and
3442 the Python codec is loaded.
3443
3444 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3445 cannot only rely on it: check also interp->fscodec_initialized for
3446 subinterpreters. */
3447 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003448 return PyUnicode_AsEncodedString(unicode,
3449 Py_FileSystemDefaultEncoding,
3450 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003451 }
3452 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003453 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003454 }
Victor Stinnerad158722010-10-27 00:25:46 +00003455#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003456}
3457
Alexander Belopolsky40018472011-02-26 01:02:56 +00003458PyObject *
3459PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003460 const char *encoding,
3461 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462{
3463 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003464 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003465
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 if (!PyUnicode_Check(unicode)) {
3467 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003468 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 }
Fred Drakee4315f52000-05-09 19:53:39 +00003470
Fred Drakee4315f52000-05-09 19:53:39 +00003471 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003472 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003473 if ((strcmp(lower, "utf-8") == 0) ||
3474 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003475 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003476 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003477 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003478 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003480 }
Victor Stinner37296e82010-06-10 13:36:23 +00003481 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003482 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003483 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003484 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003485#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003486 else if (strcmp(lower, "mbcs") == 0)
3487 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003488#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003489 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003490 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003491 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492
3493 /* Encode via the codec registry */
Serhiy Storchaka94ee3892014-02-24 14:43:03 +02003494 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003496 return NULL;
3497
3498 /* The normal path */
3499 if (PyBytes_Check(v))
3500 return v;
3501
3502 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003503 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003504 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003505 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003506
3507 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3508 "encoder %s returned bytearray instead of bytes",
3509 encoding);
3510 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003511 Py_DECREF(v);
3512 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003513 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003514
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003515 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3516 Py_DECREF(v);
3517 return b;
3518 }
3519
3520 PyErr_Format(PyExc_TypeError,
3521 "encoder did not return a bytes object (type=%.400s)",
3522 Py_TYPE(v)->tp_name);
3523 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003524 return NULL;
3525}
3526
Alexander Belopolsky40018472011-02-26 01:02:56 +00003527PyObject *
3528PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003529 const char *encoding,
3530 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003531{
3532 PyObject *v;
3533
3534 if (!PyUnicode_Check(unicode)) {
3535 PyErr_BadArgument();
3536 goto onError;
3537 }
3538
3539 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003540 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003541
3542 /* Encode via the codec registry */
3543 v = PyCodec_Encode(unicode, encoding, errors);
3544 if (v == NULL)
3545 goto onError;
3546 if (!PyUnicode_Check(v)) {
3547 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003548 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003549 Py_TYPE(v)->tp_name);
3550 Py_DECREF(v);
3551 goto onError;
3552 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003554
Benjamin Peterson29060642009-01-31 22:14:21 +00003555 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 return NULL;
3557}
3558
/* Locate the first byte sequence of str[0..len) that mbrtowc() refuses to
   convert (invalid or incomplete multibyte character) and return its byte
   offset.  Returns 0 when no such sequence is found, and always returns 0
   when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3589
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003590PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003591PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003592 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003593{
3594 wchar_t smallbuf[256];
3595 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3596 wchar_t *wstr;
3597 size_t wlen, wlen2;
3598 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003599 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003600 size_t error_pos;
3601 char *errmsg;
3602 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003603
3604 if (locale_error_handler(errors, &surrogateescape) < 0)
3605 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003606
3607 if (str[len] != '\0' || len != strlen(str)) {
3608 PyErr_SetString(PyExc_TypeError, "embedded null character");
3609 return NULL;
3610 }
3611
3612 if (surrogateescape)
3613 {
3614 wstr = _Py_char2wchar(str, &wlen);
3615 if (wstr == NULL) {
3616 if (wlen == (size_t)-1)
3617 PyErr_NoMemory();
3618 else
3619 PyErr_SetFromErrno(PyExc_OSError);
3620 return NULL;
3621 }
3622
3623 unicode = PyUnicode_FromWideChar(wstr, wlen);
3624 PyMem_Free(wstr);
3625 }
3626 else {
3627#ifndef HAVE_BROKEN_MBSTOWCS
3628 wlen = mbstowcs(NULL, str, 0);
3629#else
3630 wlen = len;
3631#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003632 if (wlen == (size_t)-1)
3633 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003634 if (wlen+1 <= smallbuf_len) {
3635 wstr = smallbuf;
3636 }
3637 else {
3638 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3639 return PyErr_NoMemory();
3640
3641 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3642 if (!wstr)
3643 return PyErr_NoMemory();
3644 }
3645
3646 /* This shouldn't fail now */
3647 wlen2 = mbstowcs(wstr, str, wlen+1);
3648 if (wlen2 == (size_t)-1) {
3649 if (wstr != smallbuf)
3650 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003651 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003652 }
3653#ifdef HAVE_BROKEN_MBSTOWCS
3654 assert(wlen2 == wlen);
3655#endif
3656 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3657 if (wstr != smallbuf)
3658 PyMem_Free(wstr);
3659 }
3660 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003661
3662decode_error:
3663 errmsg = strerror(errno);
3664 assert(errmsg != NULL);
3665
3666 error_pos = mbstowcs_errorpos(str, len);
3667 if (errmsg != NULL) {
3668 size_t errlen;
3669 wstr = _Py_char2wchar(errmsg, &errlen);
3670 if (wstr != NULL) {
3671 reason = PyUnicode_FromWideChar(wstr, errlen);
3672 PyMem_Free(wstr);
3673 } else
3674 errmsg = NULL;
3675 }
3676 if (errmsg == NULL)
3677 reason = PyUnicode_FromString(
3678 "mbstowcs() encountered an invalid multibyte sequence");
3679 if (reason == NULL)
3680 return NULL;
3681
3682 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3683 "locale", str, len,
3684 (Py_ssize_t)error_pos,
3685 (Py_ssize_t)(error_pos+1),
3686 reason);
3687 Py_DECREF(reason);
3688 if (exc != NULL) {
3689 PyCodec_StrictErrors(exc);
3690 Py_XDECREF(exc);
3691 }
3692 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003693}
3694
3695PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003696PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003697{
3698 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003699 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003700}
3701
3702
3703PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003704PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003705 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003706 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3707}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003708
Christian Heimes5894ba72007-11-04 11:43:14 +00003709PyObject*
3710PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3711{
Victor Stinner99b95382011-07-04 14:23:54 +02003712#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003713 return PyUnicode_DecodeMBCS(s, size, NULL);
3714#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003715 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003716#else
Victor Stinner793b5312011-04-27 00:24:21 +02003717 PyInterpreterState *interp = PyThreadState_GET()->interp;
3718 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3719 cannot use it to encode and decode filenames before it is loaded. Load
3720 the Python codec requires to encode at least its own filename. Use the C
3721 version of the locale codec until the codec registry is initialized and
3722 the Python codec is loaded.
3723
3724 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3725 cannot only rely on it: check also interp->fscodec_initialized for
3726 subinterpreters. */
3727 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003728 return PyUnicode_Decode(s, size,
3729 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003730 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003731 }
3732 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003733 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003734 }
Victor Stinnerad158722010-10-27 00:25:46 +00003735#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003736}
3737
Martin v. Löwis011e8422009-05-05 04:43:17 +00003738
3739int
Antoine Pitrou13348842012-01-29 18:36:34 +01003740_PyUnicode_HasNULChars(PyObject* s)
3741{
3742 static PyObject *nul = NULL;
3743
3744 if (nul == NULL)
3745 nul = PyUnicode_FromStringAndSize("\0", 1);
3746 if (nul == NULL)
3747 return -1;
3748 return PyUnicode_Contains(s, nul);
3749}
3750
3751
3752int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003753PyUnicode_FSConverter(PyObject* arg, void* addr)
3754{
3755 PyObject *output = NULL;
3756 Py_ssize_t size;
3757 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003758 if (arg == NULL) {
3759 Py_DECREF(*(PyObject**)addr);
3760 return 1;
3761 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003762 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003763 output = arg;
3764 Py_INCREF(output);
3765 }
3766 else {
3767 arg = PyUnicode_FromObject(arg);
3768 if (!arg)
3769 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003770 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003771 Py_DECREF(arg);
3772 if (!output)
3773 return 0;
3774 if (!PyBytes_Check(output)) {
3775 Py_DECREF(output);
3776 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3777 return 0;
3778 }
3779 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003780 size = PyBytes_GET_SIZE(output);
3781 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003782 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003783 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003784 Py_DECREF(output);
3785 return 0;
3786 }
3787 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003788 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003789}
3790
3791
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003792int
3793PyUnicode_FSDecoder(PyObject* arg, void* addr)
3794{
3795 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003796 if (arg == NULL) {
3797 Py_DECREF(*(PyObject**)addr);
3798 return 1;
3799 }
3800 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003801 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003802 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003803 output = arg;
3804 Py_INCREF(output);
3805 }
3806 else {
3807 arg = PyBytes_FromObject(arg);
3808 if (!arg)
3809 return 0;
3810 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3811 PyBytes_GET_SIZE(arg));
3812 Py_DECREF(arg);
3813 if (!output)
3814 return 0;
3815 if (!PyUnicode_Check(output)) {
3816 Py_DECREF(output);
3817 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3818 return 0;
3819 }
3820 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003821 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003822 Py_DECREF(output);
3823 return 0;
3824 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003825 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003826 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003827 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3828 Py_DECREF(output);
3829 return 0;
3830 }
3831 *(PyObject**)addr = output;
3832 return Py_CLEANUP_SUPPORTED;
3833}
3834
3835
Martin v. Löwis5b222132007-06-10 09:51:05 +00003836char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003838{
Christian Heimesf3863112007-11-22 07:46:41 +00003839 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003841 if (!PyUnicode_Check(unicode)) {
3842 PyErr_BadArgument();
3843 return NULL;
3844 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003845 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003846 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003848 if (PyUnicode_UTF8(unicode) == NULL) {
3849 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003850 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3851 if (bytes == NULL)
3852 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003853 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3854 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003855 Py_DECREF(bytes);
3856 return NULL;
3857 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003858 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3859 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3860 PyBytes_AS_STRING(bytes),
3861 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862 Py_DECREF(bytes);
3863 }
3864
3865 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003866 *psize = PyUnicode_UTF8_LENGTH(unicode);
3867 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003868}
3869
3870char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003872{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3874}
3875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876Py_UNICODE *
3877PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3878{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 const unsigned char *one_byte;
3880#if SIZEOF_WCHAR_T == 4
3881 const Py_UCS2 *two_bytes;
3882#else
3883 const Py_UCS4 *four_bytes;
3884 const Py_UCS4 *ucs4_end;
3885 Py_ssize_t num_surrogates;
3886#endif
3887 wchar_t *w;
3888 wchar_t *wchar_end;
3889
3890 if (!PyUnicode_Check(unicode)) {
3891 PyErr_BadArgument();
3892 return NULL;
3893 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003894 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003896 assert(_PyUnicode_KIND(unicode) != 0);
3897 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003898
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003899 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003900#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003901 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3902 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903 num_surrogates = 0;
3904
3905 for (; four_bytes < ucs4_end; ++four_bytes) {
3906 if (*four_bytes > 0xFFFF)
3907 ++num_surrogates;
3908 }
3909
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003910 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3911 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3912 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003913 PyErr_NoMemory();
3914 return NULL;
3915 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003916 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003917
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003918 w = _PyUnicode_WSTR(unicode);
3919 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3920 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003921 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3922 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003923 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003924 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003925 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3926 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003927 }
3928 else
3929 *w = *four_bytes;
3930
3931 if (w > wchar_end) {
3932 assert(0 && "Miscalculated string end");
3933 }
3934 }
3935 *w = 0;
3936#else
3937 /* sizeof(wchar_t) == 4 */
3938 Py_FatalError("Impossible unicode object state, wstr and str "
3939 "should share memory already.");
3940 return NULL;
3941#endif
3942 }
3943 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003944 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3945 (_PyUnicode_LENGTH(unicode) + 1));
3946 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 PyErr_NoMemory();
3948 return NULL;
3949 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003950 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3951 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3952 w = _PyUnicode_WSTR(unicode);
3953 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003955 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3956 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003957 for (; w < wchar_end; ++one_byte, ++w)
3958 *w = *one_byte;
3959 /* null-terminate the wstr */
3960 *w = 0;
3961 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003962 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003964 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003965 for (; w < wchar_end; ++two_bytes, ++w)
3966 *w = *two_bytes;
3967 /* null-terminate the wstr */
3968 *w = 0;
3969#else
3970 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003971 PyObject_FREE(_PyUnicode_WSTR(unicode));
3972 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973 Py_FatalError("Impossible unicode object state, wstr "
3974 "and str should share memory already.");
3975 return NULL;
3976#endif
3977 }
3978 else {
3979 assert(0 && "This should never happen.");
3980 }
3981 }
3982 }
3983 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003984 *size = PyUnicode_WSTR_LENGTH(unicode);
3985 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003986}
3987
Alexander Belopolsky40018472011-02-26 01:02:56 +00003988Py_UNICODE *
3989PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003991 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992}
3993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003994
Alexander Belopolsky40018472011-02-26 01:02:56 +00003995Py_ssize_t
3996PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997{
3998 if (!PyUnicode_Check(unicode)) {
3999 PyErr_BadArgument();
4000 goto onError;
4001 }
4002 return PyUnicode_GET_SIZE(unicode);
4003
Benjamin Peterson29060642009-01-31 22:14:21 +00004004 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 return -1;
4006}
4007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008Py_ssize_t
4009PyUnicode_GetLength(PyObject *unicode)
4010{
Victor Stinner07621332012-06-16 04:53:46 +02004011 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 PyErr_BadArgument();
4013 return -1;
4014 }
Victor Stinner07621332012-06-16 04:53:46 +02004015 if (PyUnicode_READY(unicode) == -1)
4016 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017 return PyUnicode_GET_LENGTH(unicode);
4018}
4019
4020Py_UCS4
4021PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4022{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004023 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4024 PyErr_BadArgument();
4025 return (Py_UCS4)-1;
4026 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004027 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004028 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004029 return (Py_UCS4)-1;
4030 }
4031 return PyUnicode_READ_CHAR(unicode, index);
4032}
4033
4034int
4035PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4036{
4037 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004038 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039 return -1;
4040 }
Victor Stinner488fa492011-12-12 00:01:39 +01004041 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004042 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004043 PyErr_SetString(PyExc_IndexError, "string index out of range");
4044 return -1;
4045 }
Victor Stinner488fa492011-12-12 00:01:39 +01004046 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004047 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004048 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4049 PyErr_SetString(PyExc_ValueError, "character out of range");
4050 return -1;
4051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4053 index, ch);
4054 return 0;
4055}
4056
/* Return the name of the default encoding, which is the constant
   "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *name = "utf-8";

    return name;
}
4062
Victor Stinner554f3f02010-06-16 23:33:54 +00004063/* create or adjust a UnicodeDecodeError */
4064static void
4065make_decode_exception(PyObject **exceptionObject,
4066 const char *encoding,
4067 const char *input, Py_ssize_t length,
4068 Py_ssize_t startpos, Py_ssize_t endpos,
4069 const char *reason)
4070{
4071 if (*exceptionObject == NULL) {
4072 *exceptionObject = PyUnicodeDecodeError_Create(
4073 encoding, input, length, startpos, endpos, reason);
4074 }
4075 else {
4076 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4077 goto onError;
4078 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4079 goto onError;
4080 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4081 goto onError;
4082 }
4083 return;
4084
4085onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004086 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004087}
4088
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089/* error handling callback helper:
4090 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004091 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092 and adjust various state variables.
4093 return 0 on success, -1 on error
4094*/
4095
Alexander Belopolsky40018472011-02-26 01:02:56 +00004096static int
4097unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004098 const char *encoding, const char *reason,
4099 const char **input, const char **inend, Py_ssize_t *startinpos,
4100 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004101 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004103 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104
4105 PyObject *restuple = NULL;
4106 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004107 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004108 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004109 Py_ssize_t requiredsize;
4110 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004111 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112 int res = -1;
4113
Victor Stinner596a6c42011-11-09 00:02:18 +01004114 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4115 outsize = PyUnicode_GET_LENGTH(*output);
4116 else
4117 outsize = _PyUnicode_WSTR_LENGTH(*output);
4118
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 *errorHandler = PyCodec_LookupError(errors);
4121 if (*errorHandler == NULL)
4122 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 }
4124
Victor Stinner554f3f02010-06-16 23:33:54 +00004125 make_decode_exception(exceptionObject,
4126 encoding,
4127 *input, *inend - *input,
4128 *startinpos, *endinpos,
4129 reason);
4130 if (*exceptionObject == NULL)
4131 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132
4133 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4134 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004137 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 }
4140 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004141 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004142 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004143 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004144
4145 /* Copy back the bytes variables, which might have been modified by the
4146 callback */
4147 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4148 if (!inputobj)
4149 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004150 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004151 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004152 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004153 *input = PyBytes_AS_STRING(inputobj);
4154 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004155 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004156 /* we can DECREF safely, as the exception has another reference,
4157 so the object won't go away. */
4158 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004159
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004162 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4164 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004165 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166
Victor Stinner596a6c42011-11-09 00:02:18 +01004167 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4168 /* need more space? (at least enough for what we
4169 have+the replacement+the rest of the string (starting
4170 at the new input position), so we won't have to check space
4171 when there are no errors in the rest of the string) */
4172 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004173 requiredsize = *outpos;
4174 if (requiredsize > PY_SSIZE_T_MAX - replen)
4175 goto overflow;
4176 requiredsize += replen;
4177 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4178 goto overflow;
4179 requiredsize += insize - newpos;
Victor Stinner596a6c42011-11-09 00:02:18 +01004180 if (requiredsize > outsize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004181 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinner596a6c42011-11-09 00:02:18 +01004182 requiredsize = 2*outsize;
4183 if (unicode_resize(output, requiredsize) < 0)
4184 goto onError;
4185 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004186 if (unicode_widen(output, *outpos,
4187 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004188 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004189 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004190 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004192 else {
4193 wchar_t *repwstr;
4194 Py_ssize_t repwlen;
4195 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4196 if (repwstr == NULL)
4197 goto onError;
4198 /* need more space? (at least enough for what we
4199 have+the replacement+the rest of the string (starting
4200 at the new input position), so we won't have to check space
4201 when there are no errors in the rest of the string) */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004202 requiredsize = *outpos;
4203 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4204 goto overflow;
4205 requiredsize += repwlen;
4206 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4207 goto overflow;
4208 requiredsize += insize - newpos;
Victor Stinner596a6c42011-11-09 00:02:18 +01004209 if (requiredsize > outsize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004210 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinner596a6c42011-11-09 00:02:18 +01004211 requiredsize = 2*outsize;
4212 if (unicode_resize(output, requiredsize) < 0)
4213 goto onError;
4214 }
4215 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4216 *outpos += repwlen;
4217 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004218 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004219 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004220
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221 /* we made it! */
4222 res = 0;
4223
Benjamin Peterson29060642009-01-31 22:14:21 +00004224 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004225 Py_XDECREF(restuple);
4226 return res;
Benjamin Petersona1c1be42014-09-29 18:18:57 -04004227
4228 overflow:
4229 PyErr_SetString(PyExc_OverflowError,
4230 "decoded result is too long for a Python string");
4231 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232}
4233
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004234/* --- UTF-7 Codec -------------------------------------------------------- */
4235
Antoine Pitrou244651a2009-05-04 18:56:13 +00004236/* See RFC2152 for details. We encode conservatively and decode liberally. */
4237
4238/* Three simple macros defining base-64. */
4239
/* IS_BASE64(c): non-zero when c is one of the 64 characters of the
 * base-64 alphabet (A-Z, a-z, 0-9, '+' and '/').
 * The argument is evaluated more than once; pass side-effect free
 * expressions only. */

#define IS_BASE64(c)                          \
    ((c) == '+' || (c) == '/' ||              \
     ('0' <= (c) && (c) <= '9') ||            \
     ('a' <= (c) && (c) <= 'z') ||            \
     ('A' <= (c) && (c) <= 'Z'))
4247
/* FROM_BASE64(c): the 6-bit value (0..63) of base-64 character c.
 * Only meaningful when IS_BASE64(c) holds; any character outside the
 * letter, digit and '+' ranges is mapped to 63 (the value of '/').
 * The argument is evaluated more than once. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     63)
4255
/* TO_BASE64(n): the base-64 character that encodes the low 6 bits of n.
 * Higher bits of n are ignored. */

#define TO_BASE64(n)                  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"     \
     "abcdefghijklmnopqrstuvwxyz"     \
     "0123456789+/"[(n) & 63])
4260
/* DECODE_DIRECT(c): true when byte c, met outside a base-64 section of
 * a UTF-7 string, stands for itself.  Decoding is permissive: every
 * value up to 127 qualifies except '+', which opens a base-64
 * section.  The argument is evaluated twice. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4268
4269/* The UTF-7 encoder treats ASCII characters differently according to
4270 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4271 * the above). See RFC2152. This array identifies these different
4272 * sets:
4273 * 0 : "Set D"
4274 * alphanumeric and '(),-./:?
4275 * 1 : "Set O"
4276 * !"#$%&*;<=>@[]^_`{|}
4277 * 2 : "whitespace"
4278 * ht nl cr sp
4279 * 3 : special (must be base64 encoded)
4280 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4281 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004282
/* Read-only lookup table indexed by an ASCII code (0..127); each entry
 * is the UTF-7 category of that character:
 *   0 = Set D, 1 = Set O, 2 = whitespace, 3 = special.
 * Declared const because it is only ever read. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4302
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - code point to classify; only 1..127 can be direct
 *   directO  - non-zero if "Set O" characters may be written directly
 *   directWS - non-zero if whitespace may be written directly
 *
 * All three parameters are parenthesized in the expansion so that
 * compound arguments (e.g. `a || b`) keep their intended meaning.
 * `c` is evaluated several times; pass side-effect free expressions. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004314
Alexander Belopolsky40018472011-02-26 01:02:56 +00004315PyObject *
4316PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004317 Py_ssize_t size,
4318 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004319{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004320 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4321}
4322
Antoine Pitrou244651a2009-05-04 18:56:13 +00004323/* The decoder. The only state we preserve is our read position,
4324 * i.e. how many characters we have consumed. So if we end in the
4325 * middle of a shift sequence we have to back off the read position
4326 * and the output to the beginning of the sequence, otherwise we lose
4327 * all the shift state (seen bits, number of bits seen, high
4328 * surrogate). */
4329
Alexander Belopolsky40018472011-02-26 01:02:56 +00004330PyObject *
4331PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004332 Py_ssize_t size,
4333 const char *errors,
4334 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004335{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004337 Py_ssize_t startinpos;
4338 Py_ssize_t endinpos;
4339 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004340 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004341 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004342 const char *errmsg = "";
4343 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004344 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 unsigned int base64bits = 0;
4346 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004347 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348 PyObject *errorHandler = NULL;
4349 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004350
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004351 /* Start off assuming it's all ASCII. Widen later as necessary. */
4352 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004353 if (!unicode)
4354 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004355 if (size == 0) {
4356 if (consumed)
4357 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004358 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004359 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004360
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004361 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004362 e = s + size;
4363
4364 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004365 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004366 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004367 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004368
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369 if (inShift) { /* in a base-64 section */
4370 if (IS_BASE64(ch)) { /* consume a base-64 character */
4371 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4372 base64bits += 6;
4373 s++;
4374 if (base64bits >= 16) {
4375 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004376 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004377 base64bits -= 16;
4378 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004379 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 if (surrogate) {
4381 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004382 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4383 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004384 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4385 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004387 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388 }
4389 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004390 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4391 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004392 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004393 }
4394 }
Victor Stinner551ac952011-11-29 22:58:13 +01004395 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396 /* first surrogate */
4397 surrogate = outCh;
4398 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004399 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004400 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4401 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402 }
4403 }
4404 }
4405 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004406 inShift = 0;
4407 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004408 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004409 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4410 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004411 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 if (base64bits > 0) { /* left-over bits */
4414 if (base64bits >= 6) {
4415 /* We've seen at least one base-64 character */
4416 errmsg = "partial character in shift sequence";
4417 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004418 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 else {
4420 /* Some bits remain; they should be zero */
4421 if (base64buffer != 0) {
4422 errmsg = "non-zero padding bits in shift sequence";
4423 goto utf7Error;
4424 }
4425 }
4426 }
4427 if (ch != '-') {
4428 /* '-' is absorbed; other terminating
4429 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004430 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4431 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004432 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004433 }
4434 }
4435 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004437 s++; /* consume '+' */
4438 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004440 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4441 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004442 }
4443 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004445 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004446 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004447 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004448 }
4449 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004450 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004451 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4452 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004453 s++;
4454 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 else {
4456 startinpos = s-starts;
4457 s++;
4458 errmsg = "unexpected special character";
4459 goto utf7Error;
4460 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004461 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463 endinpos = s-starts;
4464 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 errors, &errorHandler,
4466 "utf7", errmsg,
4467 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004468 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 }
4471
Antoine Pitrou244651a2009-05-04 18:56:13 +00004472 /* end of string */
4473
4474 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4475 /* if we're in an inconsistent state, that's an error */
4476 if (surrogate ||
4477 (base64bits >= 6) ||
4478 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004479 endinpos = size;
4480 if (unicode_decode_call_errorhandler(
4481 errors, &errorHandler,
4482 "utf7", "unterminated shift sequence",
4483 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004484 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004485 goto onError;
4486 if (s < e)
4487 goto restart;
4488 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004489 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004490
4491 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004492 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004494 *consumed = startinpos;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004495 if (outpos != shiftOutStart &&
4496 PyUnicode_MAX_CHAR_VALUE(unicode) > 127) {
4497 PyObject *result = PyUnicode_FromKindAndData(
4498 PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4499 shiftOutStart);
4500 Py_DECREF(unicode);
4501 unicode = result;
4502 }
4503 outpos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004504 }
4505 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004506 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004508 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004509
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004510 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004511 goto onError;
4512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513 Py_XDECREF(errorHandler);
4514 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004515 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518 Py_XDECREF(errorHandler);
4519 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004520 Py_DECREF(unicode);
4521 return NULL;
4522}
4523
4524
Alexander Belopolsky40018472011-02-26 01:02:56 +00004525PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004526_PyUnicode_EncodeUTF7(PyObject *str,
4527 int base64SetO,
4528 int base64WhiteSpace,
4529 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004531 int kind;
4532 void *data;
4533 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004534 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004536 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537 unsigned int base64bits = 0;
4538 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004539 char * out;
4540 char * start;
4541
Benjamin Petersonbac79492012-01-14 13:34:47 -05004542 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004543 return NULL;
4544 kind = PyUnicode_KIND(str);
4545 data = PyUnicode_DATA(str);
4546 len = PyUnicode_GET_LENGTH(str);
4547
4548 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004551 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004552 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004553 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004554 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 if (v == NULL)
4556 return NULL;
4557
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004558 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004559 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004560 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561
Antoine Pitrou244651a2009-05-04 18:56:13 +00004562 if (inShift) {
4563 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4564 /* shifting out */
4565 if (base64bits) { /* output remaining bits */
4566 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4567 base64buffer = 0;
4568 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004569 }
4570 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004571 /* Characters not in the BASE64 set implicitly unshift the sequence
4572 so no '-' is required, except if the character is itself a '-' */
4573 if (IS_BASE64(ch) || ch == '-') {
4574 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 *out++ = (char) ch;
4577 }
4578 else {
4579 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004580 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004581 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 else { /* not in a shift sequence */
4583 if (ch == '+') {
4584 *out++ = '+';
4585 *out++ = '-';
4586 }
4587 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4588 *out++ = (char) ch;
4589 }
4590 else {
4591 *out++ = '+';
4592 inShift = 1;
4593 goto encode_char;
4594 }
4595 }
4596 continue;
4597encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004598 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004599 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004600
Antoine Pitrou244651a2009-05-04 18:56:13 +00004601 /* code first surrogate */
4602 base64bits += 16;
4603 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4604 while (base64bits >= 6) {
4605 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4606 base64bits -= 6;
4607 }
4608 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004609 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 base64bits += 16;
4612 base64buffer = (base64buffer << 16) | ch;
4613 while (base64bits >= 6) {
4614 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4615 base64bits -= 6;
4616 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004617 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 if (base64bits)
4619 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4620 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004621 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004622 if (_PyBytes_Resize(&v, out - start) < 0)
4623 return NULL;
4624 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004625}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004626PyObject *
4627PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4628 Py_ssize_t size,
4629 int base64SetO,
4630 int base64WhiteSpace,
4631 const char *errors)
4632{
4633 PyObject *result;
4634 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4635 if (tmp == NULL)
4636 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004637 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004638 base64WhiteSpace, errors);
4639 Py_DECREF(tmp);
4640 return result;
4641}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642
Antoine Pitrou244651a2009-05-04 18:56:13 +00004643#undef IS_BASE64
4644#undef FROM_BASE64
4645#undef TO_BASE64
4646#undef DECODE_DIRECT
4647#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649/* --- UTF-8 Codec -------------------------------------------------------- */
4650
Alexander Belopolsky40018472011-02-26 01:02:56 +00004651PyObject *
4652PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004653 Py_ssize_t size,
4654 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004655{
Walter Dörwald69652032004-09-07 20:24:22 +00004656 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4657}
4658
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004659#include "stringlib/asciilib.h"
4660#include "stringlib/codecs.h"
4661#include "stringlib/undef.h"
4662
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004663#include "stringlib/ucs1lib.h"
4664#include "stringlib/codecs.h"
4665#include "stringlib/undef.h"
4666
4667#include "stringlib/ucs2lib.h"
4668#include "stringlib/codecs.h"
4669#include "stringlib/undef.h"
4670
4671#include "stringlib/ucs4lib.h"
4672#include "stringlib/codecs.h"
4673#include "stringlib/undef.h"
4674
Antoine Pitrouab868312009-01-10 15:40:25 +00004675/* Mask to quickly check whether a C 'long' contains a
4676 non-ASCII, UTF8-encoded char. */
4677#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004678# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004679#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004680# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004681#else
4682# error C 'long' size should be either 4 or 8!
4683#endif
4684
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004685static Py_ssize_t
4686ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004687{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004689 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004690
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004691 /*
4692 * Issue #17237: m68k is a bit different from most architectures in
4693 * that objects do not use "natural alignment" - for example, int and
4694 * long are only aligned at 2-byte boundaries. Therefore the assert()
4695 * won't work; also, tests have shown that skipping the "optimised
4696 * version" will even speed up m68k.
4697 */
4698#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004699#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004700 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4701 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004702 /* Fast path, see in STRINGLIB(utf8_decode) for
4703 an explanation. */
4704 /* Help register allocation */
4705 register const char *_p = p;
4706 register Py_UCS1 * q = dest;
4707 while (_p < aligned_end) {
4708 unsigned long value = *(const unsigned long *) _p;
4709 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004710 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004711 *((unsigned long *)q) = value;
4712 _p += SIZEOF_LONG;
4713 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004714 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004715 p = _p;
4716 while (p < end) {
4717 if ((unsigned char)*p & 0x80)
4718 break;
4719 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004723#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004724#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004725 while (p < end) {
4726 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4727 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004728 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004729 /* Help register allocation */
4730 register const char *_p = p;
4731 while (_p < aligned_end) {
4732 unsigned long value = *(unsigned long *) _p;
4733 if (value & ASCII_CHAR_MASK)
4734 break;
4735 _p += SIZEOF_LONG;
4736 }
4737 p = _p;
4738 if (_p == end)
4739 break;
4740 }
4741 if ((unsigned char)*p & 0x80)
4742 break;
4743 ++p;
4744 }
4745 memcpy(dest, start, p - start);
4746 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747}
Antoine Pitrouab868312009-01-10 15:40:25 +00004748
Victor Stinner785938e2011-12-11 20:09:03 +01004749PyObject *
4750PyUnicode_DecodeUTF8Stateful(const char *s,
4751 Py_ssize_t size,
4752 const char *errors,
4753 Py_ssize_t *consumed)
4754{
Victor Stinner785938e2011-12-11 20:09:03 +01004755 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004756 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004757 const char *end = s + size;
4758 Py_ssize_t outpos;
4759
4760 Py_ssize_t startinpos;
4761 Py_ssize_t endinpos;
4762 const char *errmsg = "";
4763 PyObject *errorHandler = NULL;
4764 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004765
4766 if (size == 0) {
4767 if (consumed)
4768 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004769 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004770 }
4771
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4773 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004774 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004775 *consumed = 1;
4776 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004777 }
4778
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004780 if (!unicode)
4781 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004782
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004783 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4784 s += outpos;
4785 while (s < end) {
4786 Py_UCS4 ch;
4787 int kind = PyUnicode_KIND(unicode);
4788 if (kind == PyUnicode_1BYTE_KIND) {
4789 if (PyUnicode_IS_ASCII(unicode))
4790 ch = asciilib_utf8_decode(&s, end,
4791 PyUnicode_1BYTE_DATA(unicode), &outpos);
4792 else
4793 ch = ucs1lib_utf8_decode(&s, end,
4794 PyUnicode_1BYTE_DATA(unicode), &outpos);
4795 } else if (kind == PyUnicode_2BYTE_KIND) {
4796 ch = ucs2lib_utf8_decode(&s, end,
4797 PyUnicode_2BYTE_DATA(unicode), &outpos);
4798 } else {
4799 assert(kind == PyUnicode_4BYTE_KIND);
4800 ch = ucs4lib_utf8_decode(&s, end,
4801 PyUnicode_4BYTE_DATA(unicode), &outpos);
4802 }
4803
4804 switch (ch) {
4805 case 0:
4806 if (s == end || consumed)
4807 goto End;
4808 errmsg = "unexpected end of data";
4809 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004810 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004811 break;
4812 case 1:
4813 errmsg = "invalid start byte";
4814 startinpos = s - starts;
4815 endinpos = startinpos + 1;
4816 break;
4817 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004818 case 3:
4819 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004820 errmsg = "invalid continuation byte";
4821 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004822 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004823 break;
4824 default:
4825 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4826 goto onError;
4827 continue;
4828 }
4829
4830 if (unicode_decode_call_errorhandler(
4831 errors, &errorHandler,
4832 "utf-8", errmsg,
4833 &starts, &end, &startinpos, &endinpos, &exc, &s,
4834 &unicode, &outpos))
4835 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004836 }
4837
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004838End:
4839 if (unicode_resize(&unicode, outpos) < 0)
4840 goto onError;
4841
4842 if (consumed)
4843 *consumed = s - starts;
4844
4845 Py_XDECREF(errorHandler);
4846 Py_XDECREF(exc);
4847 assert(_PyUnicode_CheckConsistency(unicode, 1));
4848 return unicode;
4849
4850onError:
4851 Py_XDECREF(errorHandler);
4852 Py_XDECREF(exc);
4853 Py_XDECREF(unicode);
4854 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004855}
4856
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004857#ifdef __APPLE__
4858
4859/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner27b1ca22012-12-03 12:47:59 +01004860 used to decode the command line arguments on Mac OS X.
4861
4862 Return a pointer to a newly allocated wide character string (use
4863 PyMem_Free() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004864
4865wchar_t*
4866_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4867{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004868 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004869 wchar_t *unicode;
4870 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004871
4872 /* Note: size will always be longer than the resulting Unicode
4873 character count */
Victor Stinner27b1ca22012-12-03 12:47:59 +01004874 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004875 return NULL;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004876 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4877 if (!unicode)
4878 return NULL;
4879
4880 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004881 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004883 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004885#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004886 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004887#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004889#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004890 if (ch > 0xFF) {
4891#if SIZEOF_WCHAR_T == 4
4892 assert(0);
4893#else
4894 assert(Py_UNICODE_IS_SURROGATE(ch));
4895 /* compute and append the two surrogates: */
4896 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4897 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4898#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004899 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004900 else {
4901 if (!ch && s == e)
4902 break;
4903 /* surrogateescape */
4904 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4905 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004906 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004907 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004908 return unicode;
4909}
4910
4911#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004913/* Primary internal function which creates utf8 encoded bytes objects.
4914
4915 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004916 and allocate exactly as much space needed at the end. Else allocate the
4917 maximum possible needed (4 result bytes per Unicode character), and return
4918 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004919*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004920PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004921_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922{
Victor Stinner6099a032011-12-18 14:22:26 +01004923 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004924 void *data;
4925 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004927 if (!PyUnicode_Check(unicode)) {
4928 PyErr_BadArgument();
4929 return NULL;
4930 }
4931
4932 if (PyUnicode_READY(unicode) == -1)
4933 return NULL;
4934
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004935 if (PyUnicode_UTF8(unicode))
4936 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4937 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004938
4939 kind = PyUnicode_KIND(unicode);
4940 data = PyUnicode_DATA(unicode);
4941 size = PyUnicode_GET_LENGTH(unicode);
4942
Benjamin Petersonead6b532011-12-20 17:23:42 -06004943 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004944 default:
4945 assert(0);
4946 case PyUnicode_1BYTE_KIND:
4947 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4948 assert(!PyUnicode_IS_ASCII(unicode));
4949 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4950 case PyUnicode_2BYTE_KIND:
4951 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4952 case PyUnicode_4BYTE_KIND:
4953 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004954 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955}
4956
Alexander Belopolsky40018472011-02-26 01:02:56 +00004957PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004958PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4959 Py_ssize_t size,
4960 const char *errors)
4961{
4962 PyObject *v, *unicode;
4963
4964 unicode = PyUnicode_FromUnicode(s, size);
4965 if (unicode == NULL)
4966 return NULL;
4967 v = _PyUnicode_AsUTF8String(unicode, errors);
4968 Py_DECREF(unicode);
4969 return v;
4970}
4971
4972PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004973PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004975 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004976}
4977
/* --- UTF-32 Codec ------------------------------------------------------- */
4979
4980PyObject *
4981PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004982 Py_ssize_t size,
4983 const char *errors,
4984 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004985{
4986 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4987}
4988
4989PyObject *
4990PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004991 Py_ssize_t size,
4992 const char *errors,
4993 int *byteorder,
4994 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004995{
4996 const char *starts = s;
4997 Py_ssize_t startinpos;
4998 Py_ssize_t endinpos;
4999 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005000 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005001 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005002 int bo = 0; /* assume native ordering by default */
5003 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005004 /* Offsets from q for retrieving bytes in the right order. */
5005#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5006 int iorder[] = {0, 1, 2, 3};
5007#else
5008 int iorder[] = {3, 2, 1, 0};
5009#endif
5010 PyObject *errorHandler = NULL;
5011 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005012
Walter Dörwald41980ca2007-08-16 21:55:45 +00005013 q = (unsigned char *)s;
5014 e = q + size;
5015
5016 if (byteorder)
5017 bo = *byteorder;
5018
5019 /* Check for BOM marks (U+FEFF) in the input and adjust current
5020 byte order setting accordingly. In native mode, the leading BOM
5021 mark is skipped, in all other modes, it is copied to the output
5022 stream as-is (giving a ZWNBSP character). */
5023 if (bo == 0) {
5024 if (size >= 4) {
5025 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005027#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 if (bom == 0x0000FEFF) {
5029 q += 4;
5030 bo = -1;
5031 }
5032 else if (bom == 0xFFFE0000) {
5033 q += 4;
5034 bo = 1;
5035 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005036#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 if (bom == 0x0000FEFF) {
5038 q += 4;
5039 bo = 1;
5040 }
5041 else if (bom == 0xFFFE0000) {
5042 q += 4;
5043 bo = -1;
5044 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005045#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005047 }
5048
5049 if (bo == -1) {
5050 /* force LE */
5051 iorder[0] = 0;
5052 iorder[1] = 1;
5053 iorder[2] = 2;
5054 iorder[3] = 3;
5055 }
5056 else if (bo == 1) {
5057 /* force BE */
5058 iorder[0] = 3;
5059 iorder[1] = 2;
5060 iorder[2] = 1;
5061 iorder[3] = 0;
5062 }
5063
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005064 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005065 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005066 if (!unicode)
5067 return NULL;
5068 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005069 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005070 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005071
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005073 Py_UCS4 ch;
5074 /* remaining bytes at the end? (size should be divisible by 4) */
5075 if (e-q<4) {
5076 if (consumed)
5077 break;
5078 errmsg = "truncated data";
5079 startinpos = ((const char *)q)-starts;
5080 endinpos = ((const char *)e)-starts;
5081 goto utf32Error;
5082 /* The remaining input chars are ignored if the callback
5083 chooses to skip the input */
5084 }
5085 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5086 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 if (ch >= 0x110000)
5089 {
5090 errmsg = "codepoint not in range(0x110000)";
5091 startinpos = ((const char *)q)-starts;
5092 endinpos = startinpos+4;
5093 goto utf32Error;
5094 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005095 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5096 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 q += 4;
5098 continue;
5099 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 if (unicode_decode_call_errorhandler(
5101 errors, &errorHandler,
5102 "utf32", errmsg,
5103 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005104 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005106 }
5107
5108 if (byteorder)
5109 *byteorder = bo;
5110
5111 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005112 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113
5114 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005115 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005116 goto onError;
5117
5118 Py_XDECREF(errorHandler);
5119 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005120 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005121
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005123 Py_DECREF(unicode);
5124 Py_XDECREF(errorHandler);
5125 Py_XDECREF(exc);
5126 return NULL;
5127}
5128
5129PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005130_PyUnicode_EncodeUTF32(PyObject *str,
5131 const char *errors,
5132 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005133{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005134 int kind;
5135 void *data;
5136 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005137 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005139 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005140 /* Offsets from p for storing byte pairs in the right order. */
5141#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5142 int iorder[] = {0, 1, 2, 3};
5143#else
5144 int iorder[] = {3, 2, 1, 0};
5145#endif
5146
Benjamin Peterson29060642009-01-31 22:14:21 +00005147#define STORECHAR(CH) \
5148 do { \
5149 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5150 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5151 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5152 p[iorder[0]] = (CH) & 0xff; \
5153 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005154 } while(0)
5155
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005156 if (!PyUnicode_Check(str)) {
5157 PyErr_BadArgument();
5158 return NULL;
5159 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005160 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005161 return NULL;
5162 kind = PyUnicode_KIND(str);
5163 data = PyUnicode_DATA(str);
5164 len = PyUnicode_GET_LENGTH(str);
5165
5166 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005167 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005169 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005170 if (v == NULL)
5171 return NULL;
5172
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005173 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005174 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005175 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005176 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005177 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005178
5179 if (byteorder == -1) {
5180 /* force LE */
5181 iorder[0] = 0;
5182 iorder[1] = 1;
5183 iorder[2] = 2;
5184 iorder[3] = 3;
5185 }
5186 else if (byteorder == 1) {
5187 /* force BE */
5188 iorder[0] = 3;
5189 iorder[1] = 2;
5190 iorder[2] = 1;
5191 iorder[3] = 0;
5192 }
5193
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005194 for (i = 0; i < len; i++)
5195 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005196
5197 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005198 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005199#undef STORECHAR
5200}
5201
Alexander Belopolsky40018472011-02-26 01:02:56 +00005202PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005203PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5204 Py_ssize_t size,
5205 const char *errors,
5206 int byteorder)
5207{
5208 PyObject *result;
5209 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5210 if (tmp == NULL)
5211 return NULL;
5212 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5213 Py_DECREF(tmp);
5214 return result;
5215}
5216
5217PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005218PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005219{
Victor Stinnerb960b342011-11-20 19:12:52 +01005220 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005221}
5222
/* --- UTF-16 Codec ------------------------------------------------------- */
5224
Tim Peters772747b2001-08-09 22:21:55 +00005225PyObject *
5226PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 Py_ssize_t size,
5228 const char *errors,
5229 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230{
Walter Dörwald69652032004-09-07 20:24:22 +00005231 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5232}
5233
5234PyObject *
5235PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 Py_ssize_t size,
5237 const char *errors,
5238 int *byteorder,
5239 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005240{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005242 Py_ssize_t startinpos;
5243 Py_ssize_t endinpos;
5244 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005245 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005246 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005247 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005248 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005249 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 PyObject *errorHandler = NULL;
5251 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252
Tim Peters772747b2001-08-09 22:21:55 +00005253 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005254 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255
5256 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005257 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005259 /* Check for BOM marks (U+FEFF) in the input and adjust current
5260 byte order setting accordingly. In native mode, the leading BOM
5261 mark is skipped, in all other modes, it is copied to the output
5262 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005263 if (bo == 0 && size >= 2) {
5264 const Py_UCS4 bom = (q[1] << 8) | q[0];
5265 if (bom == 0xFEFF) {
5266 q += 2;
5267 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005268 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005269 else if (bom == 0xFFFE) {
5270 q += 2;
5271 bo = 1;
5272 }
5273 if (byteorder)
5274 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005275 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276
Antoine Pitrou63065d72012-05-15 23:48:04 +02005277 if (q == e) {
5278 if (consumed)
5279 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005280 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005281 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005282
Antoine Pitrouab868312009-01-10 15:40:25 +00005283#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005284 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005285#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005286 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005287#endif
Tim Peters772747b2001-08-09 22:21:55 +00005288
Antoine Pitrou63065d72012-05-15 23:48:04 +02005289 /* Note: size will always be longer than the resulting Unicode
5290 character count */
5291 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5292 if (!unicode)
5293 return NULL;
5294
5295 outpos = 0;
5296 while (1) {
5297 Py_UCS4 ch = 0;
5298 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005299 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005300 if (kind == PyUnicode_1BYTE_KIND) {
5301 if (PyUnicode_IS_ASCII(unicode))
5302 ch = asciilib_utf16_decode(&q, e,
5303 PyUnicode_1BYTE_DATA(unicode), &outpos,
5304 native_ordering);
5305 else
5306 ch = ucs1lib_utf16_decode(&q, e,
5307 PyUnicode_1BYTE_DATA(unicode), &outpos,
5308 native_ordering);
5309 } else if (kind == PyUnicode_2BYTE_KIND) {
5310 ch = ucs2lib_utf16_decode(&q, e,
5311 PyUnicode_2BYTE_DATA(unicode), &outpos,
5312 native_ordering);
5313 } else {
5314 assert(kind == PyUnicode_4BYTE_KIND);
5315 ch = ucs4lib_utf16_decode(&q, e,
5316 PyUnicode_4BYTE_DATA(unicode), &outpos,
5317 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005318 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005319 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005320
Antoine Pitrou63065d72012-05-15 23:48:04 +02005321 switch (ch)
5322 {
5323 case 0:
5324 /* remaining byte at the end? (size should be even) */
5325 if (q == e || consumed)
5326 goto End;
5327 errmsg = "truncated data";
5328 startinpos = ((const char *)q) - starts;
5329 endinpos = ((const char *)e) - starts;
5330 break;
5331 /* The remaining input chars are ignored if the callback
5332 chooses to skip the input */
5333 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005334 q -= 2;
5335 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005336 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005337 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005338 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005339 endinpos = ((const char *)e) - starts;
5340 break;
5341 case 2:
5342 errmsg = "illegal encoding";
5343 startinpos = ((const char *)q) - 2 - starts;
5344 endinpos = startinpos + 2;
5345 break;
5346 case 3:
5347 errmsg = "illegal UTF-16 surrogate";
5348 startinpos = ((const char *)q) - 4 - starts;
5349 endinpos = startinpos + 2;
5350 break;
5351 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005352 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5353 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 continue;
5355 }
5356
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005358 errors,
5359 &errorHandler,
5360 "utf16", errmsg,
5361 &starts,
5362 (const char **)&e,
5363 &startinpos,
5364 &endinpos,
5365 &exc,
5366 (const char **)&q,
5367 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005368 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370 }
5371
Antoine Pitrou63065d72012-05-15 23:48:04 +02005372End:
Walter Dörwald69652032004-09-07 20:24:22 +00005373 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005375
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005377 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 goto onError;
5379
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005380 Py_XDECREF(errorHandler);
5381 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005382 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005386 Py_XDECREF(errorHandler);
5387 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 return NULL;
5389}
5390
Tim Peters772747b2001-08-09 22:21:55 +00005391PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005392_PyUnicode_EncodeUTF16(PyObject *str,
5393 const char *errors,
5394 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005396 enum PyUnicode_Kind kind;
5397 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005398 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005399 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005400 unsigned short *out;
5401 Py_ssize_t bytesize;
5402 Py_ssize_t pairs;
5403#ifdef WORDS_BIGENDIAN
5404 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005405#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005406 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005407#endif
5408
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005409 if (!PyUnicode_Check(str)) {
5410 PyErr_BadArgument();
5411 return NULL;
5412 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005413 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005414 return NULL;
5415 kind = PyUnicode_KIND(str);
5416 data = PyUnicode_DATA(str);
5417 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005418
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005419 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005420 if (kind == PyUnicode_4BYTE_KIND) {
5421 const Py_UCS4 *in = (const Py_UCS4 *)data;
5422 const Py_UCS4 *end = in + len;
5423 while (in < end)
5424 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005425 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005426 }
5427 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005429 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005430 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 if (v == NULL)
5432 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005434 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005435 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005436 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005438 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005439 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005440 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005441
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005442 switch (kind) {
5443 case PyUnicode_1BYTE_KIND: {
5444 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5445 break;
Tim Peters772747b2001-08-09 22:21:55 +00005446 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005447 case PyUnicode_2BYTE_KIND: {
5448 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5449 break;
Tim Peters772747b2001-08-09 22:21:55 +00005450 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005451 case PyUnicode_4BYTE_KIND: {
5452 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5453 break;
5454 }
5455 default:
5456 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005457 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005458
5459 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005460 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461}
5462
Alexander Belopolsky40018472011-02-26 01:02:56 +00005463PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005464PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5465 Py_ssize_t size,
5466 const char *errors,
5467 int byteorder)
5468{
5469 PyObject *result;
5470 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5471 if (tmp == NULL)
5472 return NULL;
5473 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5474 Py_DECREF(tmp);
5475 return result;
5476}
5477
5478PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005479PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005481 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482}
5483
/* --- Unicode Escape Codec ----------------------------------------------- */
5485
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether the
   escaped input decodes to a pure ASCII string.

   s:    input bytes (need not be NUL-terminated).
   size: number of bytes at `s`.

   Returns the number of characters the decoded string will contain, or -1
   when that cannot be guaranteed to be ASCII: `size` is negative, a byte
   above 127 is present, a backslash is the last byte, or one of the
   octal, \x, \u, \U or \N escapes occurs.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate the size before forming `s + size`; computing a pointer in
       front of the buffer is not allowed even if it is never dereferenced. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5540
Fredrik Lundh06d12682001-01-24 07:59:11 +00005541static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005542
Alexander Belopolsky40018472011-02-26 01:02:56 +00005543PyObject *
5544PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005545 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005546 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005548 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005549 Py_ssize_t startinpos;
5550 Py_ssize_t endinpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005551 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005553 char* message;
5554 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005555 PyObject *errorHandler = NULL;
5556 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005557 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005558 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005559
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005560 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005561
5562 /* After length_of_escaped_ascii_string() there are two alternatives,
5563 either the string is pure ASCII with named escapes like \n, etc.
5564 and we determined it's exact size (common case)
5565 or it contains \x, \u, ... escape sequences. then we create a
5566 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005567 if (len >= 0) {
5568 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005569 if (!v)
5570 goto onError;
5571 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005572 }
5573 else {
5574 /* Escaped strings will always be longer than the resulting
5575 Unicode string, so we start with size here and then reduce the
5576 length after conversion to the true value.
5577 (but if the error callback returns a long replacement string
5578 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005579 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005580 if (!v)
5581 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005582 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005583 }
5584
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005586 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005587 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005589
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 while (s < end) {
5591 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005592 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005593 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005595 /* The only case in which i == ascii_length is a backslash
5596 followed by a newline. */
5597 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005598
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 /* Non-escape characters are interpreted as Unicode ordinals */
5600 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005601 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5602 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 continue;
5604 }
5605
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005606 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 /* \ - Escapes */
5608 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005609 c = *s++;
5610 if (s > end)
5611 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005612
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005613 /* The only case in which i == ascii_length is a backslash
5614 followed by a newline. */
5615 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005616
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005617 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618
Benjamin Peterson29060642009-01-31 22:14:21 +00005619 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005620#define WRITECHAR(ch) \
5621 do { \
5622 if (unicode_putchar(&v, &i, ch) < 0) \
5623 goto onError; \
5624 }while(0)
5625
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005627 case '\\': WRITECHAR('\\'); break;
5628 case '\'': WRITECHAR('\''); break;
5629 case '\"': WRITECHAR('\"'); break;
5630 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005632 case 'f': WRITECHAR('\014'); break;
5633 case 't': WRITECHAR('\t'); break;
5634 case 'n': WRITECHAR('\n'); break;
5635 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005636 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005637 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005638 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005639 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640
Benjamin Peterson29060642009-01-31 22:14:21 +00005641 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 case '0': case '1': case '2': case '3':
5643 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005644 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005645 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005646 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005647 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005648 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005650 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 break;
5652
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 /* hex escapes */
5654 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005656 digits = 2;
5657 message = "truncated \\xXX escape";
5658 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005662 digits = 4;
5663 message = "truncated \\uXXXX escape";
5664 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005667 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005668 digits = 8;
5669 message = "truncated \\UXXXXXXXX escape";
5670 hexescape:
5671 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005672 if (end - s < digits) {
5673 /* count only hex digits */
5674 for (; s < end; ++s) {
5675 c = (unsigned char)*s;
5676 if (!Py_ISXDIGIT(c))
5677 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005678 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005679 goto error;
5680 }
5681 for (; digits--; ++s) {
5682 c = (unsigned char)*s;
5683 if (!Py_ISXDIGIT(c))
5684 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005685 chr = (chr<<4) & ~0xF;
5686 if (c >= '0' && c <= '9')
5687 chr += c - '0';
5688 else if (c >= 'a' && c <= 'f')
5689 chr += 10 + c - 'a';
5690 else
5691 chr += 10 + c - 'A';
5692 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005693 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005694 /* _decoding_error will have already written into the
5695 target buffer. */
5696 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005697 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005698 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005699 message = "illegal Unicode character";
5700 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005701 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005702 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005703 break;
5704
Benjamin Peterson29060642009-01-31 22:14:21 +00005705 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005706 case 'N':
5707 message = "malformed \\N character escape";
5708 if (ucnhash_CAPI == NULL) {
5709 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005710 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5711 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005712 if (ucnhash_CAPI == NULL)
5713 goto ucnhashError;
5714 }
5715 if (*s == '{') {
5716 const char *start = s+1;
5717 /* look for the closing brace */
5718 while (*s != '}' && s < end)
5719 s++;
5720 if (s > start && s < end && *s == '}') {
5721 /* found a name. look it up in the unicode database */
5722 message = "unknown Unicode character name";
5723 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005724 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005725 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005726 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005727 goto store;
5728 }
5729 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005730 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005731
5732 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005733 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005734 message = "\\ at end of string";
5735 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005736 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005737 }
5738 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005739 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005740 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005741 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005742 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005744 continue;
5745
5746 error:
5747 endinpos = s-starts;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005748 if (unicode_decode_call_errorhandler(
5749 errors, &errorHandler,
5750 "unicodeescape", message,
5751 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005752 &v, &i))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005753 goto onError;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005754 len = PyUnicode_GET_LENGTH(v);
Serhiy Storchakad6793772013-01-29 10:20:44 +02005755 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005757#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005758
Victor Stinner16e6a802011-12-12 13:24:15 +01005759 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005760 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005761 Py_XDECREF(errorHandler);
5762 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005763 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005764
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005766 PyErr_SetString(
5767 PyExc_UnicodeError,
5768 "\\N escapes not supported (can't load unicodedata module)"
5769 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005770 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 Py_XDECREF(errorHandler);
5772 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005773 return NULL;
5774
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 Py_XDECREF(errorHandler);
5778 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 return NULL;
5780}
5781
5782/* Return a Unicode-Escape string version of the Unicode object.
5783
5784 If quotes is true, the string is enclosed in u"" or u'' quotes as
5785 appropriate.
5786
5787*/
5788
Alexander Belopolsky40018472011-02-26 01:02:56 +00005789PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005790PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005792 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005793 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005795 int kind;
5796 void *data;
5797 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798
Ezio Melottie7f90372012-10-05 03:33:31 +03005799 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005800 escape.
5801
Ezio Melottie7f90372012-10-05 03:33:31 +03005802 For UCS1 strings it's '\xxx', 4 bytes per source character.
5803 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5804 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005805 */
5806
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005807 if (!PyUnicode_Check(unicode)) {
5808 PyErr_BadArgument();
5809 return NULL;
5810 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005811 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005812 return NULL;
5813 len = PyUnicode_GET_LENGTH(unicode);
5814 kind = PyUnicode_KIND(unicode);
5815 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005816 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005817 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5818 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5819 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5820 }
5821
5822 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005823 return PyBytes_FromStringAndSize(NULL, 0);
5824
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005825 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005827
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005828 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005830 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 if (repr == NULL)
5833 return NULL;
5834
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005835 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005837 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005838 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005839
Walter Dörwald79e913e2007-05-12 11:08:06 +00005840 /* Escape backslashes */
5841 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 *p++ = '\\';
5843 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005844 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005845 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005846
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005847 /* Map 21-bit characters to '\U00xxxxxx' */
5848 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005849 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005850 *p++ = '\\';
5851 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005852 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5853 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5854 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5855 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5856 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5857 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5858 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5859 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005861 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005862
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005864 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 *p++ = '\\';
5866 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005867 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5868 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5869 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5870 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005872
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005873 /* Map special whitespace to '\t', \n', '\r' */
5874 else if (ch == '\t') {
5875 *p++ = '\\';
5876 *p++ = 't';
5877 }
5878 else if (ch == '\n') {
5879 *p++ = '\\';
5880 *p++ = 'n';
5881 }
5882 else if (ch == '\r') {
5883 *p++ = '\\';
5884 *p++ = 'r';
5885 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005886
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005887 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005888 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005890 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005891 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5892 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005893 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005894
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 /* Copy everything else as-is */
5896 else
5897 *p++ = (char) ch;
5898 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005900 assert(p - PyBytes_AS_STRING(repr) > 0);
5901 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5902 return NULL;
5903 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904}
5905
Alexander Belopolsky40018472011-02-26 01:02:56 +00005906PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005907PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5908 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005910 PyObject *result;
5911 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5912 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005914 result = PyUnicode_AsUnicodeEscapeString(tmp);
5915 Py_DECREF(tmp);
5916 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917}
5918
5919/* --- Raw Unicode Escape Codec ------------------------------------------- */
5920
Alexander Belopolsky40018472011-02-26 01:02:56 +00005921PyObject *
5922PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005923 Py_ssize_t size,
5924 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005927 Py_ssize_t startinpos;
5928 Py_ssize_t endinpos;
5929 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005930 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 const char *end;
5932 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005933 PyObject *errorHandler = NULL;
5934 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005935
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 /* Escaped strings will always be longer than the resulting
5937 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 length after conversion to the true value. (But decoding error
5939 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005940 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005944 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005945 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 end = s + size;
5947 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 unsigned char c;
5949 Py_UCS4 x;
5950 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005951 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 /* Non-escape characters are interpreted as Unicode ordinals */
5954 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005955 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5956 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005958 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 startinpos = s-starts;
5960
5961 /* \u-escapes are only interpreted iff the number of leading
5962 backslashes if odd */
5963 bs = s;
5964 for (;s < end;) {
5965 if (*s != '\\')
5966 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005967 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5968 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 }
5970 if (((s - bs) & 1) == 0 ||
5971 s >= end ||
5972 (*s != 'u' && *s != 'U')) {
5973 continue;
5974 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005975 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 count = *s=='u' ? 4 : 8;
5977 s++;
5978
5979 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 for (x = 0, i = 0; i < count; ++i, ++s) {
5981 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005982 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 endinpos = s-starts;
5984 if (unicode_decode_call_errorhandler(
5985 errors, &errorHandler,
5986 "rawunicodeescape", "truncated \\uXXXX",
5987 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005988 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 goto onError;
5990 goto nextByte;
5991 }
5992 x = (x<<4) & ~0xF;
5993 if (c >= '0' && c <= '9')
5994 x += c - '0';
5995 else if (c >= 'a' && c <= 'f')
5996 x += 10 + c - 'a';
5997 else
5998 x += 10 + c - 'A';
5999 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006000 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006001 if (unicode_putchar(&v, &outpos, x) < 0)
6002 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006003 } else {
6004 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006005 if (unicode_decode_call_errorhandler(
6006 errors, &errorHandler,
6007 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006009 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006011 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 nextByte:
6013 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006015 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006017 Py_XDECREF(errorHandler);
6018 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006019 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006020
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 Py_XDECREF(errorHandler);
6024 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 return NULL;
6026}
6027
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006028
Alexander Belopolsky40018472011-02-26 01:02:56 +00006029PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006030PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006032 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 char *p;
6034 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006035 Py_ssize_t expandsize, pos;
6036 int kind;
6037 void *data;
6038 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006040 if (!PyUnicode_Check(unicode)) {
6041 PyErr_BadArgument();
6042 return NULL;
6043 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006044 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006045 return NULL;
6046 kind = PyUnicode_KIND(unicode);
6047 data = PyUnicode_DATA(unicode);
6048 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006049 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6050 bytes, and 1 byte characters 4. */
6051 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006052
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006053 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006055
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006056 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 if (repr == NULL)
6058 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006059 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006060 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006062 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006063 for (pos = 0; pos < len; pos++) {
6064 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 /* Map 32-bit characters to '\Uxxxxxxxx' */
6066 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006067 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006068 *p++ = '\\';
6069 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006070 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6071 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6072 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6073 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6074 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6075 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6076 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6077 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006078 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006080 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 *p++ = '\\';
6082 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006083 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6084 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6085 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6086 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 /* Copy everything else as-is */
6089 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 *p++ = (char) ch;
6091 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006092
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006093 assert(p > q);
6094 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006095 return NULL;
6096 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097}
6098
Alexander Belopolsky40018472011-02-26 01:02:56 +00006099PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006100PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6101 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006103 PyObject *result;
6104 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6105 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006106 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006107 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6108 Py_DECREF(tmp);
6109 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110}
6111
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006112/* --- Unicode Internal Codec ------------------------------------------- */
6113
Alexander Belopolsky40018472011-02-26 01:02:56 +00006114PyObject *
6115_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006116 Py_ssize_t size,
6117 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006118{
6119 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006120 Py_ssize_t startinpos;
6121 Py_ssize_t endinpos;
6122 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006123 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006124 const char *end;
6125 const char *reason;
6126 PyObject *errorHandler = NULL;
6127 PyObject *exc = NULL;
6128
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006129 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006130 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006131 1))
6132 return NULL;
6133
Thomas Wouters89f507f2006-12-13 04:49:30 +00006134 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006135 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006136 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006138 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006139 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006140 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006141 end = s + size;
6142
6143 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006144 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006145 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006146 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006147 endinpos = end-starts;
6148 reason = "truncated input";
6149 goto error;
6150 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006151 /* We copy the raw representation one byte at a time because the
6152 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006153 ((char *) &uch)[0] = s[0];
6154 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006155#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006156 ((char *) &uch)[2] = s[2];
6157 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006158#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006159 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006160#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006161 /* We have to sanity check the raw data, otherwise doom looms for
6162 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006163 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006164 endinpos = s - starts + Py_UNICODE_SIZE;
6165 reason = "illegal code point (> 0x10FFFF)";
6166 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006167 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006168#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006169 s += Py_UNICODE_SIZE;
6170#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006171 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006172 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006173 Py_UNICODE uch2;
6174 ((char *) &uch2)[0] = s[0];
6175 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006176 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006177 {
Victor Stinner551ac952011-11-29 22:58:13 +01006178 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006179 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006180 }
6181 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006182#endif
6183
6184 if (unicode_putchar(&v, &outpos, ch) < 0)
6185 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006186 continue;
6187
6188 error:
6189 startinpos = s - starts;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006190 if (unicode_decode_call_errorhandler(
6191 errors, &errorHandler,
6192 "unicode_internal", reason,
6193 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006194 &v, &outpos))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006195 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006196 }
6197
Victor Stinner16e6a802011-12-12 13:24:15 +01006198 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006199 goto onError;
6200 Py_XDECREF(errorHandler);
6201 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006202 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006203
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006205 Py_XDECREF(v);
6206 Py_XDECREF(errorHandler);
6207 Py_XDECREF(exc);
6208 return NULL;
6209}
6210
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211/* --- Latin-1 Codec ------------------------------------------------------ */
6212
Alexander Belopolsky40018472011-02-26 01:02:56 +00006213PyObject *
6214PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006215 Py_ssize_t size,
6216 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006219 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220}
6221
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006222/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006223static void
6224make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006225 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006226 PyObject *unicode,
6227 Py_ssize_t startpos, Py_ssize_t endpos,
6228 const char *reason)
6229{
6230 if (*exceptionObject == NULL) {
6231 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006232 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006233 encoding, unicode, startpos, endpos, reason);
6234 }
6235 else {
6236 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6237 goto onError;
6238 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6239 goto onError;
6240 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6241 goto onError;
6242 return;
6243 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006244 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006245 }
6246}
6247
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006248/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006249static void
6250raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006251 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006252 PyObject *unicode,
6253 Py_ssize_t startpos, Py_ssize_t endpos,
6254 const char *reason)
6255{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006256 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006257 encoding, unicode, startpos, endpos, reason);
6258 if (*exceptionObject != NULL)
6259 PyCodec_StrictErrors(*exceptionObject);
6260}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006261
6262/* error handling callback helper:
6263 build arguments, call the callback and check the arguments,
6264 put the result into newpos and return the replacement string, which
6265 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006266static PyObject *
6267unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006268 PyObject **errorHandler,
6269 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006270 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006271 Py_ssize_t startpos, Py_ssize_t endpos,
6272 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006273{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006274 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006275 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006276 PyObject *restuple;
6277 PyObject *resunicode;
6278
6279 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283 }
6284
Benjamin Petersonbac79492012-01-14 13:34:47 -05006285 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006286 return NULL;
6287 len = PyUnicode_GET_LENGTH(unicode);
6288
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006289 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006290 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293
6294 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006299 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 Py_DECREF(restuple);
6301 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006302 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006303 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006304 &resunicode, newpos)) {
6305 Py_DECREF(restuple);
6306 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006308 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6309 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6310 Py_DECREF(restuple);
6311 return NULL;
6312 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006313 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006314 *newpos = len + *newpos;
6315 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6317 Py_DECREF(restuple);
6318 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006319 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006320 Py_INCREF(resunicode);
6321 Py_DECREF(restuple);
6322 return resunicode;
6323}
6324
Alexander Belopolsky40018472011-02-26 01:02:56 +00006325static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006326unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006327 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006328 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006329{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006330 /* input state */
6331 Py_ssize_t pos=0, size;
6332 int kind;
6333 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006334 /* output object */
6335 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006336 /* pointer into the output */
6337 char *str;
6338 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006339 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006340 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6341 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342 PyObject *errorHandler = NULL;
6343 PyObject *exc = NULL;
6344 /* the following variable is used for caching string comparisons
6345 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6346 int known_errorHandler = -1;
6347
Benjamin Petersonbac79492012-01-14 13:34:47 -05006348 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006349 return NULL;
6350 size = PyUnicode_GET_LENGTH(unicode);
6351 kind = PyUnicode_KIND(unicode);
6352 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006353 /* allocate enough for a simple encoding without
6354 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006355 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006356 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006357 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006358 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006359 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006360 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006361 ressize = size;
6362
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006363 while (pos < size) {
6364 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006365
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 /* can we encode this? */
6367 if (c<limit) {
6368 /* no overflow check, because we know that the space is enough */
6369 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006370 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006371 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 Py_ssize_t requiredsize;
6374 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006375 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006377 Py_ssize_t collstart = pos;
6378 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006380 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 ++collend;
6382 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6383 if (known_errorHandler==-1) {
6384 if ((errors==NULL) || (!strcmp(errors, "strict")))
6385 known_errorHandler = 1;
6386 else if (!strcmp(errors, "replace"))
6387 known_errorHandler = 2;
6388 else if (!strcmp(errors, "ignore"))
6389 known_errorHandler = 3;
6390 else if (!strcmp(errors, "xmlcharrefreplace"))
6391 known_errorHandler = 4;
6392 else
6393 known_errorHandler = 0;
6394 }
6395 switch (known_errorHandler) {
6396 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006397 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 goto onError;
6399 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006400 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 *str++ = '?'; /* fall through */
6402 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006403 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 break;
6405 case 4: /* xmlcharrefreplace */
6406 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006407 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006408 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006409 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006410 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006411 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006412 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006413 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006414 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006415 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006416 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006417 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006418 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006419 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006421 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006422 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006423 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006424 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006425 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006426 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006427 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006428 if (requiredsize > PY_SSIZE_T_MAX - incr)
6429 goto overflow;
6430 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006432 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6433 goto overflow;
6434 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006436 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 requiredsize = 2*ressize;
6438 if (_PyBytes_Resize(&res, requiredsize))
6439 goto onError;
6440 str = PyBytes_AS_STRING(res) + respos;
6441 ressize = requiredsize;
6442 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006443 /* generate replacement */
6444 for (i = collstart; i < collend; ++i) {
6445 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006447 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 break;
6449 default:
6450 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006451 encoding, reason, unicode, &exc,
6452 collstart, collend, &newpos);
6453 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006454 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006456 if (PyBytes_Check(repunicode)) {
6457 /* Directly copy bytes result to output. */
6458 repsize = PyBytes_Size(repunicode);
6459 if (repsize > 1) {
6460 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006461 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006462 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6463 Py_DECREF(repunicode);
6464 goto overflow;
6465 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006466 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6467 Py_DECREF(repunicode);
6468 goto onError;
6469 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006470 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006471 ressize += repsize-1;
6472 }
6473 memcpy(str, PyBytes_AsString(repunicode), repsize);
6474 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006475 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006476 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006477 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006478 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 /* need more space? (at least enough for what we
6480 have+the replacement+the rest of the string, so
6481 we won't have to check space for encodable characters) */
6482 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006483 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006484 requiredsize = respos;
6485 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6486 goto overflow;
6487 requiredsize += repsize;
6488 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6489 goto overflow;
6490 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006491 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006492 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 requiredsize = 2*ressize;
6494 if (_PyBytes_Resize(&res, requiredsize)) {
6495 Py_DECREF(repunicode);
6496 goto onError;
6497 }
6498 str = PyBytes_AS_STRING(res) + respos;
6499 ressize = requiredsize;
6500 }
6501 /* check if there is anything unencodable in the replacement
6502 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 for (i = 0; repsize-->0; ++i, ++str) {
6504 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006506 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006507 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 Py_DECREF(repunicode);
6509 goto onError;
6510 }
6511 *str = (char)c;
6512 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006513 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006514 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006515 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006516 }
6517 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006518 /* Resize if we allocated to much */
6519 size = str - PyBytes_AS_STRING(res);
6520 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006521 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006522 if (_PyBytes_Resize(&res, size) < 0)
6523 goto onError;
6524 }
6525
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006526 Py_XDECREF(errorHandler);
6527 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006528 return res;
6529
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006530 overflow:
6531 PyErr_SetString(PyExc_OverflowError,
6532 "encoded result is too long for a Python string");
6533
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006534 onError:
6535 Py_XDECREF(res);
6536 Py_XDECREF(errorHandler);
6537 Py_XDECREF(exc);
6538 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006539}
6540
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006541/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006542PyObject *
6543PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006544 Py_ssize_t size,
6545 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006547 PyObject *result;
6548 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6549 if (unicode == NULL)
6550 return NULL;
6551 result = unicode_encode_ucs1(unicode, errors, 256);
6552 Py_DECREF(unicode);
6553 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554}
6555
Alexander Belopolsky40018472011-02-26 01:02:56 +00006556PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006557_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558{
6559 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 PyErr_BadArgument();
6561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006563 if (PyUnicode_READY(unicode) == -1)
6564 return NULL;
6565 /* Fast path: if it is a one-byte string, construct
6566 bytes object directly. */
6567 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6568 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6569 PyUnicode_GET_LENGTH(unicode));
6570 /* Non-Latin-1 characters present. Defer to above function to
6571 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006572 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006573}
6574
6575PyObject*
6576PyUnicode_AsLatin1String(PyObject *unicode)
6577{
6578 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579}
6580
6581/* --- 7-bit ASCII Codec -------------------------------------------------- */
6582
Alexander Belopolsky40018472011-02-26 01:02:56 +00006583PyObject *
6584PyUnicode_DecodeASCII(const char *s,
6585 Py_ssize_t size,
6586 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006588 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006589 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006590 int kind;
6591 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006592 Py_ssize_t startinpos;
6593 Py_ssize_t endinpos;
6594 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006595 const char *e;
6596 PyObject *errorHandler = NULL;
6597 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006598
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006600 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006601
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006603 if (size == 1 && (unsigned char)s[0] < 128)
6604 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006605
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006606 unicode = PyUnicode_New(size, 127);
6607 if (unicode == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006609
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006610 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006611 data = PyUnicode_1BYTE_DATA(unicode);
6612 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6613 if (outpos == size)
6614 return unicode;
6615
6616 s += outpos;
6617 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006618 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 register unsigned char c = (unsigned char)*s;
6620 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006621 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 ++s;
6623 }
6624 else {
6625 startinpos = s-starts;
6626 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 if (unicode_decode_call_errorhandler(
6628 errors, &errorHandler,
6629 "ascii", "ordinal not in range(128)",
6630 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006631 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006632 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006633 kind = PyUnicode_KIND(unicode);
6634 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006637 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006638 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639 Py_XDECREF(errorHandler);
6640 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006641 assert(_PyUnicode_CheckConsistency(unicode, 1));
6642 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006643
Benjamin Peterson29060642009-01-31 22:14:21 +00006644 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006645 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646 Py_XDECREF(errorHandler);
6647 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 return NULL;
6649}
6650
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006651/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006652PyObject *
6653PyUnicode_EncodeASCII(const Py_UNICODE *p,
6654 Py_ssize_t size,
6655 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657 PyObject *result;
6658 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6659 if (unicode == NULL)
6660 return NULL;
6661 result = unicode_encode_ucs1(unicode, errors, 128);
6662 Py_DECREF(unicode);
6663 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664}
6665
Alexander Belopolsky40018472011-02-26 01:02:56 +00006666PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006667_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668{
6669 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 PyErr_BadArgument();
6671 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006673 if (PyUnicode_READY(unicode) == -1)
6674 return NULL;
6675 /* Fast path: if it is an ASCII-only string, construct bytes object
6676 directly. Else defer to above function to raise the exception. */
6677 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6678 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6679 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006680 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006681}
6682
6683PyObject *
6684PyUnicode_AsASCIIString(PyObject *unicode)
6685{
6686 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687}
6688
Victor Stinner99b95382011-07-04 14:23:54 +02006689#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006690
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006691/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006692
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006693#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006694#define NEED_RETRY
6695#endif
6696
Victor Stinner3a50e702011-10-18 21:21:00 +02006697#ifndef WC_ERR_INVALID_CHARS
6698# define WC_ERR_INVALID_CHARS 0x0080
6699#endif
6700
6701static char*
6702code_page_name(UINT code_page, PyObject **obj)
6703{
6704 *obj = NULL;
6705 if (code_page == CP_ACP)
6706 return "mbcs";
6707 if (code_page == CP_UTF7)
6708 return "CP_UTF7";
6709 if (code_page == CP_UTF8)
6710 return "CP_UTF8";
6711
6712 *obj = PyBytes_FromFormat("cp%u", code_page);
6713 if (*obj == NULL)
6714 return NULL;
6715 return PyBytes_AS_STRING(*obj);
6716}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006717
Alexander Belopolsky40018472011-02-26 01:02:56 +00006718static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006719is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006720{
6721 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006722 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006723
Victor Stinner3a50e702011-10-18 21:21:00 +02006724 if (!IsDBCSLeadByteEx(code_page, *curr))
6725 return 0;
6726
6727 prev = CharPrevExA(code_page, s, curr, 0);
6728 if (prev == curr)
6729 return 1;
6730 /* FIXME: This code is limited to "true" double-byte encodings,
6731 as it assumes an incomplete character consists of a single
6732 byte. */
6733 if (curr - prev == 2)
6734 return 1;
6735 if (!IsDBCSLeadByteEx(code_page, *prev))
6736 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006737 return 0;
6738}
6739
Victor Stinner3a50e702011-10-18 21:21:00 +02006740static DWORD
6741decode_code_page_flags(UINT code_page)
6742{
6743 if (code_page == CP_UTF7) {
6744 /* The CP_UTF7 decoder only supports flags=0 */
6745 return 0;
6746 }
6747 else
6748 return MB_ERR_INVALID_CHARS;
6749}
6750
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006751/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006752 * Decode a byte string from a Windows code page into unicode object in strict
6753 * mode.
6754 *
6755 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6756 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006757 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006758static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006759decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006760 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006761 const char *in,
6762 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006763{
Victor Stinner3a50e702011-10-18 21:21:00 +02006764 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006765 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006766 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006767
6768 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006769 assert(insize > 0);
6770 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6771 if (outsize <= 0)
6772 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006773
6774 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006776 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006777 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 if (*v == NULL)
6779 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006780 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006781 }
6782 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006784 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006785 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006786 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006787 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006788 }
6789
6790 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006791 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6792 if (outsize <= 0)
6793 goto error;
6794 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006795
Victor Stinner3a50e702011-10-18 21:21:00 +02006796error:
6797 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6798 return -2;
6799 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006800 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006801}
6802
Victor Stinner3a50e702011-10-18 21:21:00 +02006803/*
6804 * Decode a byte string from a code page into unicode object with an error
6805 * handler.
6806 *
6807 * Returns consumed size if succeed, or raise a WindowsError or
6808 * UnicodeDecodeError exception and returns -1 on error.
6809 */
6810static int
6811decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006812 PyObject **v,
6813 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006814 const char *errors)
6815{
6816 const char *startin = in;
6817 const char *endin = in + size;
6818 const DWORD flags = decode_code_page_flags(code_page);
6819 /* Ideally, we should get reason from FormatMessage. This is the Windows
6820 2000 English version of the message. */
6821 const char *reason = "No mapping for the Unicode character exists "
6822 "in the target code page.";
6823 /* each step cannot decode more than 1 character, but a character can be
6824 represented as a surrogate pair */
6825 wchar_t buffer[2], *startout, *out;
6826 int insize, outsize;
6827 PyObject *errorHandler = NULL;
6828 PyObject *exc = NULL;
6829 PyObject *encoding_obj = NULL;
6830 char *encoding;
6831 DWORD err;
6832 int ret = -1;
6833
6834 assert(size > 0);
6835
6836 encoding = code_page_name(code_page, &encoding_obj);
6837 if (encoding == NULL)
6838 return -1;
6839
6840 if (errors == NULL || strcmp(errors, "strict") == 0) {
6841 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6842 UnicodeDecodeError. */
6843 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6844 if (exc != NULL) {
6845 PyCodec_StrictErrors(exc);
6846 Py_CLEAR(exc);
6847 }
6848 goto error;
6849 }
6850
6851 if (*v == NULL) {
6852 /* Create unicode object */
6853 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6854 PyErr_NoMemory();
6855 goto error;
6856 }
Victor Stinnerab595942011-12-17 04:59:06 +01006857 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006858 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006859 if (*v == NULL)
6860 goto error;
6861 startout = PyUnicode_AS_UNICODE(*v);
6862 }
6863 else {
6864 /* Extend unicode object */
6865 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6866 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6867 PyErr_NoMemory();
6868 goto error;
6869 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006870 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 goto error;
6872 startout = PyUnicode_AS_UNICODE(*v) + n;
6873 }
6874
6875 /* Decode the byte string character per character */
6876 out = startout;
6877 while (in < endin)
6878 {
6879 /* Decode a character */
6880 insize = 1;
6881 do
6882 {
6883 outsize = MultiByteToWideChar(code_page, flags,
6884 in, insize,
6885 buffer, Py_ARRAY_LENGTH(buffer));
6886 if (outsize > 0)
6887 break;
6888 err = GetLastError();
6889 if (err != ERROR_NO_UNICODE_TRANSLATION
6890 && err != ERROR_INSUFFICIENT_BUFFER)
6891 {
6892 PyErr_SetFromWindowsErr(0);
6893 goto error;
6894 }
6895 insize++;
6896 }
6897 /* 4=maximum length of a UTF-8 sequence */
6898 while (insize <= 4 && (in + insize) <= endin);
6899
6900 if (outsize <= 0) {
6901 Py_ssize_t startinpos, endinpos, outpos;
6902
6903 startinpos = in - startin;
6904 endinpos = startinpos + 1;
6905 outpos = out - PyUnicode_AS_UNICODE(*v);
6906 if (unicode_decode_call_errorhandler(
6907 errors, &errorHandler,
6908 encoding, reason,
6909 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006910 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006911 {
6912 goto error;
6913 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006914 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006915 }
6916 else {
6917 in += insize;
6918 memcpy(out, buffer, outsize * sizeof(wchar_t));
6919 out += outsize;
6920 }
6921 }
6922
6923 /* write a NUL character at the end */
6924 *out = 0;
6925
6926 /* Extend unicode object */
6927 outsize = out - startout;
6928 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006929 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006930 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006931 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006932
6933error:
6934 Py_XDECREF(encoding_obj);
6935 Py_XDECREF(errorHandler);
6936 Py_XDECREF(exc);
6937 return ret;
6938}
6939
Victor Stinner3a50e702011-10-18 21:21:00 +02006940static PyObject *
6941decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006942 const char *s, Py_ssize_t size,
6943 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006944{
Victor Stinner76a31a62011-11-04 00:05:13 +01006945 PyObject *v = NULL;
6946 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006947
Victor Stinner3a50e702011-10-18 21:21:00 +02006948 if (code_page < 0) {
6949 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6950 return NULL;
6951 }
6952
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006953 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006955
Victor Stinner76a31a62011-11-04 00:05:13 +01006956 do
6957 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006958#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006959 if (size > INT_MAX) {
6960 chunk_size = INT_MAX;
6961 final = 0;
6962 done = 0;
6963 }
6964 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006965#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006966 {
6967 chunk_size = (int)size;
6968 final = (consumed == NULL);
6969 done = 1;
6970 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006971
Victor Stinner76a31a62011-11-04 00:05:13 +01006972 /* Skip trailing lead-byte unless 'final' is set */
6973 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6974 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006975
Victor Stinner76a31a62011-11-04 00:05:13 +01006976 if (chunk_size == 0 && done) {
6977 if (v != NULL)
6978 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006979 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006980 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006981
Victor Stinner76a31a62011-11-04 00:05:13 +01006982
6983 converted = decode_code_page_strict(code_page, &v,
6984 s, chunk_size);
6985 if (converted == -2)
6986 converted = decode_code_page_errors(code_page, &v,
6987 s, chunk_size,
6988 errors);
6989 assert(converted != 0);
6990
6991 if (converted < 0) {
6992 Py_XDECREF(v);
6993 return NULL;
6994 }
6995
6996 if (consumed)
6997 *consumed += converted;
6998
6999 s += converted;
7000 size -= converted;
7001 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007002
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007003 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007004}
7005
Alexander Belopolsky40018472011-02-26 01:02:56 +00007006PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007007PyUnicode_DecodeCodePageStateful(int code_page,
7008 const char *s,
7009 Py_ssize_t size,
7010 const char *errors,
7011 Py_ssize_t *consumed)
7012{
7013 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7014}
7015
7016PyObject *
7017PyUnicode_DecodeMBCSStateful(const char *s,
7018 Py_ssize_t size,
7019 const char *errors,
7020 Py_ssize_t *consumed)
7021{
7022 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7023}
7024
7025PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007026PyUnicode_DecodeMBCS(const char *s,
7027 Py_ssize_t size,
7028 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007029{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007030 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7031}
7032
Victor Stinner3a50e702011-10-18 21:21:00 +02007033static DWORD
7034encode_code_page_flags(UINT code_page, const char *errors)
7035{
7036 if (code_page == CP_UTF8) {
7037 if (winver.dwMajorVersion >= 6)
7038 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7039 and later */
7040 return WC_ERR_INVALID_CHARS;
7041 else
7042 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7043 return 0;
7044 }
7045 else if (code_page == CP_UTF7) {
7046 /* CP_UTF7 only supports flags=0 */
7047 return 0;
7048 }
7049 else {
7050 if (errors != NULL && strcmp(errors, "replace") == 0)
7051 return 0;
7052 else
7053 return WC_NO_BEST_FIT_CHARS;
7054 }
7055}
7056
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007057/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007058 * Encode a Unicode string to a Windows code page into a byte string in strict
7059 * mode.
7060 *
7061 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7062 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007063 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007064static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007065encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007066 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007067 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007068{
Victor Stinner554f3f02010-06-16 23:33:54 +00007069 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007070 BOOL *pusedDefaultChar = &usedDefaultChar;
7071 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007072 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007073 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007074 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 const DWORD flags = encode_code_page_flags(code_page, NULL);
7076 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007077 /* Create a substring so that we can get the UTF-16 representation
7078 of just the slice under consideration. */
7079 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007080
Martin v. Löwis3d325192011-11-04 18:23:06 +01007081 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007082
Victor Stinner3a50e702011-10-18 21:21:00 +02007083 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007084 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007085 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007086 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007087
Victor Stinner2fc507f2011-11-04 20:06:39 +01007088 substring = PyUnicode_Substring(unicode, offset, offset+len);
7089 if (substring == NULL)
7090 return -1;
7091 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7092 if (p == NULL) {
7093 Py_DECREF(substring);
7094 return -1;
7095 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007096
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007097 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007098 outsize = WideCharToMultiByte(code_page, flags,
7099 p, size,
7100 NULL, 0,
7101 NULL, pusedDefaultChar);
7102 if (outsize <= 0)
7103 goto error;
7104 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007105 if (pusedDefaultChar && *pusedDefaultChar) {
7106 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007107 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007108 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007109
Victor Stinner3a50e702011-10-18 21:21:00 +02007110 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007111 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007112 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007113 if (*outbytes == NULL) {
7114 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007116 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118 }
7119 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 const Py_ssize_t n = PyBytes_Size(*outbytes);
7122 if (outsize > PY_SSIZE_T_MAX - n) {
7123 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007124 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007127 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7128 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007130 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007131 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007132 }
7133
7134 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 outsize = WideCharToMultiByte(code_page, flags,
7136 p, size,
7137 out, outsize,
7138 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007139 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 if (outsize <= 0)
7141 goto error;
7142 if (pusedDefaultChar && *pusedDefaultChar)
7143 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007144 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007145
Victor Stinner3a50e702011-10-18 21:21:00 +02007146error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007147 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007148 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7149 return -2;
7150 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007151 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007152}
7153
Victor Stinner3a50e702011-10-18 21:21:00 +02007154/*
7155 * Encode a Unicode string to a Windows code page into a byte string using a
7156 * error handler.
7157 *
7158 * Returns consumed characters if succeed, or raise a WindowsError and returns
7159 * -1 on other error.
7160 */
7161static int
7162encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007163 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007164 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007165{
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007167 Py_ssize_t pos = unicode_offset;
7168 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007169 /* Ideally, we should get reason from FormatMessage. This is the Windows
7170 2000 English version of the message. */
7171 const char *reason = "invalid character";
7172 /* 4=maximum length of a UTF-8 sequence */
7173 char buffer[4];
7174 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7175 Py_ssize_t outsize;
7176 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007177 PyObject *errorHandler = NULL;
7178 PyObject *exc = NULL;
7179 PyObject *encoding_obj = NULL;
7180 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007181 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 PyObject *rep;
7183 int ret = -1;
7184
7185 assert(insize > 0);
7186
7187 encoding = code_page_name(code_page, &encoding_obj);
7188 if (encoding == NULL)
7189 return -1;
7190
7191 if (errors == NULL || strcmp(errors, "strict") == 0) {
7192 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7193 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007194 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007195 if (exc != NULL) {
7196 PyCodec_StrictErrors(exc);
7197 Py_DECREF(exc);
7198 }
7199 Py_XDECREF(encoding_obj);
7200 return -1;
7201 }
7202
7203 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7204 pusedDefaultChar = &usedDefaultChar;
7205 else
7206 pusedDefaultChar = NULL;
7207
7208 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7209 PyErr_NoMemory();
7210 goto error;
7211 }
7212 outsize = insize * Py_ARRAY_LENGTH(buffer);
7213
7214 if (*outbytes == NULL) {
7215 /* Create string object */
7216 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7217 if (*outbytes == NULL)
7218 goto error;
7219 out = PyBytes_AS_STRING(*outbytes);
7220 }
7221 else {
7222 /* Extend string object */
7223 Py_ssize_t n = PyBytes_Size(*outbytes);
7224 if (n > PY_SSIZE_T_MAX - outsize) {
7225 PyErr_NoMemory();
7226 goto error;
7227 }
7228 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7229 goto error;
7230 out = PyBytes_AS_STRING(*outbytes) + n;
7231 }
7232
7233 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007234 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007235 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007236 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7237 wchar_t chars[2];
7238 int charsize;
7239 if (ch < 0x10000) {
7240 chars[0] = (wchar_t)ch;
7241 charsize = 1;
7242 }
7243 else {
7244 ch -= 0x10000;
7245 chars[0] = 0xd800 + (ch >> 10);
7246 chars[1] = 0xdc00 + (ch & 0x3ff);
7247 charsize = 2;
7248 }
7249
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007251 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007252 buffer, Py_ARRAY_LENGTH(buffer),
7253 NULL, pusedDefaultChar);
7254 if (outsize > 0) {
7255 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7256 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007257 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 memcpy(out, buffer, outsize);
7259 out += outsize;
7260 continue;
7261 }
7262 }
7263 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7264 PyErr_SetFromWindowsErr(0);
7265 goto error;
7266 }
7267
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 rep = unicode_encode_call_errorhandler(
7269 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007270 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007271 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 if (rep == NULL)
7273 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007274 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007275
7276 if (PyBytes_Check(rep)) {
7277 outsize = PyBytes_GET_SIZE(rep);
7278 if (outsize != 1) {
7279 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7280 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7281 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7282 Py_DECREF(rep);
7283 goto error;
7284 }
7285 out = PyBytes_AS_STRING(*outbytes) + offset;
7286 }
7287 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7288 out += outsize;
7289 }
7290 else {
7291 Py_ssize_t i;
7292 enum PyUnicode_Kind kind;
7293 void *data;
7294
Benjamin Petersonbac79492012-01-14 13:34:47 -05007295 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 Py_DECREF(rep);
7297 goto error;
7298 }
7299
7300 outsize = PyUnicode_GET_LENGTH(rep);
7301 if (outsize != 1) {
7302 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7303 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7304 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7305 Py_DECREF(rep);
7306 goto error;
7307 }
7308 out = PyBytes_AS_STRING(*outbytes) + offset;
7309 }
7310 kind = PyUnicode_KIND(rep);
7311 data = PyUnicode_DATA(rep);
7312 for (i=0; i < outsize; i++) {
7313 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7314 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007315 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007316 encoding, unicode,
7317 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007318 "unable to encode error handler result to ASCII");
7319 Py_DECREF(rep);
7320 goto error;
7321 }
7322 *out = (unsigned char)ch;
7323 out++;
7324 }
7325 }
7326 Py_DECREF(rep);
7327 }
7328 /* write a NUL byte */
7329 *out = 0;
7330 outsize = out - PyBytes_AS_STRING(*outbytes);
7331 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7332 if (_PyBytes_Resize(outbytes, outsize) < 0)
7333 goto error;
7334 ret = 0;
7335
7336error:
7337 Py_XDECREF(encoding_obj);
7338 Py_XDECREF(errorHandler);
7339 Py_XDECREF(exc);
7340 return ret;
7341}
7342
Victor Stinner3a50e702011-10-18 21:21:00 +02007343static PyObject *
7344encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007345 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007346 const char *errors)
7347{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007348 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007349 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007350 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007351 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007352
Benjamin Petersonbac79492012-01-14 13:34:47 -05007353 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007354 return NULL;
7355 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007356
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 if (code_page < 0) {
7358 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7359 return NULL;
7360 }
7361
Martin v. Löwis3d325192011-11-04 18:23:06 +01007362 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007363 return PyBytes_FromStringAndSize(NULL, 0);
7364
Victor Stinner7581cef2011-11-03 22:32:33 +01007365 offset = 0;
7366 do
7367 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007368#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007369 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007370 chunks. */
7371 if (len > INT_MAX/2) {
7372 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007373 done = 0;
7374 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007375 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007376#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007377 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007378 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007379 done = 1;
7380 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007381
Victor Stinner76a31a62011-11-04 00:05:13 +01007382 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007383 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007384 errors);
7385 if (ret == -2)
7386 ret = encode_code_page_errors(code_page, &outbytes,
7387 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007388 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007389 if (ret < 0) {
7390 Py_XDECREF(outbytes);
7391 return NULL;
7392 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007393
Victor Stinner7581cef2011-11-03 22:32:33 +01007394 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007395 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007396 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007397
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 return outbytes;
7399}
7400
7401PyObject *
7402PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7403 Py_ssize_t size,
7404 const char *errors)
7405{
Victor Stinner7581cef2011-11-03 22:32:33 +01007406 PyObject *unicode, *res;
7407 unicode = PyUnicode_FromUnicode(p, size);
7408 if (unicode == NULL)
7409 return NULL;
7410 res = encode_code_page(CP_ACP, unicode, errors);
7411 Py_DECREF(unicode);
7412 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007413}
7414
7415PyObject *
7416PyUnicode_EncodeCodePage(int code_page,
7417 PyObject *unicode,
7418 const char *errors)
7419{
Victor Stinner7581cef2011-11-03 22:32:33 +01007420 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007421}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007422
Alexander Belopolsky40018472011-02-26 01:02:56 +00007423PyObject *
7424PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007425{
7426 if (!PyUnicode_Check(unicode)) {
7427 PyErr_BadArgument();
7428 return NULL;
7429 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007430 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007431}
7432
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007433#undef NEED_RETRY
7434
Victor Stinner99b95382011-07-04 14:23:54 +02007435#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007436
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437/* --- Character Mapping Codec -------------------------------------------- */
7438
Alexander Belopolsky40018472011-02-26 01:02:56 +00007439PyObject *
7440PyUnicode_DecodeCharmap(const char *s,
7441 Py_ssize_t size,
7442 PyObject *mapping,
7443 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007445 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007446 Py_ssize_t startinpos;
7447 Py_ssize_t endinpos;
7448 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007449 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007450 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007451 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007452 PyObject *errorHandler = NULL;
7453 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007454
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455 /* Default to Latin-1 */
7456 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007459 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007463 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007464 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007465 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007466 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007467 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007468 enum PyUnicode_Kind mapkind;
7469 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007470 Py_UCS4 x;
7471
Benjamin Petersonbac79492012-01-14 13:34:47 -05007472 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007473 return NULL;
7474
7475 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007476 mapdata = PyUnicode_DATA(mapping);
7477 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007479 unsigned char ch;
7480 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7481 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7482 if (outkind == PyUnicode_1BYTE_KIND) {
7483 void *outdata = PyUnicode_DATA(v);
7484 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7485 while (s < e) {
7486 unsigned char ch = *s;
7487 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7488 if (x > maxchar)
7489 goto Error;
7490 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7491 ++s;
7492 }
7493 break;
7494 }
7495 else if (outkind == PyUnicode_2BYTE_KIND) {
7496 void *outdata = PyUnicode_DATA(v);
7497 while (s < e) {
7498 unsigned char ch = *s;
7499 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7500 if (x == 0xFFFE)
7501 goto Error;
7502 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7503 ++s;
7504 }
7505 break;
7506 }
7507 }
7508 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007511 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007512 else
7513 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007514Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007515 if (x == 0xfffe)
7516 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 startinpos = s-starts;
7519 endinpos = startinpos+1;
7520 if (unicode_decode_call_errorhandler(
7521 errors, &errorHandler,
7522 "charmap", "character maps to <undefined>",
7523 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007524 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 goto onError;
7526 }
7527 continue;
7528 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007529
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007530 if (unicode_putchar(&v, &outpos, x) < 0)
7531 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007532 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007533 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007534 }
7535 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 while (s < e) {
7537 unsigned char ch = *s;
7538 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007539
Benjamin Peterson29060642009-01-31 22:14:21 +00007540 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7541 w = PyLong_FromLong((long)ch);
7542 if (w == NULL)
7543 goto onError;
7544 x = PyObject_GetItem(mapping, w);
7545 Py_DECREF(w);
7546 if (x == NULL) {
7547 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7548 /* No mapping found means: mapping is undefined. */
7549 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007550 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 } else
7552 goto onError;
7553 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007554
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007556 if (x == Py_None)
7557 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 if (PyLong_Check(x)) {
7559 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007560 if (value == 0xFFFE)
7561 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007562 if (value < 0 || value > MAX_UNICODE) {
7563 PyErr_Format(PyExc_TypeError,
7564 "character mapping must be in range(0x%lx)",
7565 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 Py_DECREF(x);
7567 goto onError;
7568 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007569 if (unicode_putchar(&v, &outpos, value) < 0) {
7570 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007571 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007572 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007574 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007575 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007576
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007577 if (PyUnicode_READY(x) == -1) {
7578 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007579 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007580 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007581 targetsize = PyUnicode_GET_LENGTH(x);
7582
7583 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 /* 1-1 mapping */
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007585 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007586 if (value == 0xFFFE)
7587 goto Undefined;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007588 if (unicode_putchar(&v, &outpos, value) < 0) {
7589 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007590 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007591 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007592 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 else if (targetsize > 1) {
7594 /* 1-n mapping */
7595 if (targetsize > extrachars) {
7596 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 Py_ssize_t needed = (targetsize - extrachars) + \
7598 (targetsize << 2);
7599 extrachars += needed;
7600 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007601 if (unicode_resize(&v,
7602 PyUnicode_GET_LENGTH(v) + needed) < 0)
7603 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 Py_DECREF(x);
7605 goto onError;
7606 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007608 if (unicode_widen(&v, outpos,
7609 PyUnicode_MAX_CHAR_VALUE(x)) < 0) {
7610 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007611 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007612 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007613 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7614 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 extrachars -= targetsize;
7616 }
7617 /* 1-0 mapping: skip the character */
7618 }
7619 else {
7620 /* wrong return value */
7621 PyErr_SetString(PyExc_TypeError,
7622 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007623 Py_DECREF(x);
7624 goto onError;
7625 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 Py_DECREF(x);
7627 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007628 continue;
7629Undefined:
7630 /* undefined mapping */
7631 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007632 startinpos = s-starts;
7633 endinpos = startinpos+1;
7634 if (unicode_decode_call_errorhandler(
7635 errors, &errorHandler,
7636 "charmap", "character maps to <undefined>",
7637 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007638 &v, &outpos)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007639 goto onError;
7640 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007643 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007644 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007645 Py_XDECREF(errorHandler);
7646 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007647 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007648
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007650 Py_XDECREF(errorHandler);
7651 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652 Py_XDECREF(v);
7653 return NULL;
7654}
7655
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007656/* Charmap encoding: the lookup table */
7657
/* Object layout of the charmap encoding lookup table.  The fixed-size part
   is followed in memory by variable-length table data that starts at
   level23: encoding_map_size() reports the object size as
   sizeof(struct encoding_map) - 1 + 16*count2 + 128*count3 bytes. */
struct encoding_map {
    PyObject_HEAD
    /* first-level table; NOTE(review): presumably indexed by the top bits
       of the character (ch >> 11) -- confirm against the lookup code */
    unsigned char level1[32];
    /* numbers of 16-byte and 128-byte units stored after the header */
    int count2, count3;
    /* first byte of the trailing table data (declared with one element,
       which is why the size computation subtracts 1) */
    unsigned char level23[1];
};
7664
7665static PyObject*
7666encoding_map_size(PyObject *obj, PyObject* args)
7667{
7668 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007669 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007671}
7672
7673static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007674 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 PyDoc_STR("Return the size (in bytes) of this object") },
7676 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007677};
7678
7679static void
7680encoding_map_dealloc(PyObject* o)
7681{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007682 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007683}
7684
/* Type object for struct encoding_map instances ("EncodingMap").  Only the
   deallocator (encoding_map_dealloc) and the method table
   (encoding_map_methods) are provided; every other slot is left at 0.
   tp_itemsize is 0 although instances carry trailing data after the
   struct.  The initializer is positional, so the order of the entries
   below must match the slot order of PyTypeObject. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7728
7729PyObject*
7730PyUnicode_BuildEncodingMap(PyObject* string)
7731{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007732 PyObject *result;
7733 struct encoding_map *mresult;
7734 int i;
7735 int need_dict = 0;
7736 unsigned char level1[32];
7737 unsigned char level2[512];
7738 unsigned char *mlevel1, *mlevel2, *mlevel3;
7739 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007740 int kind;
7741 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007742 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007743 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007744
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007745 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007746 PyErr_BadArgument();
7747 return NULL;
7748 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007749 kind = PyUnicode_KIND(string);
7750 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007751 length = PyUnicode_GET_LENGTH(string);
7752 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007753 memset(level1, 0xFF, sizeof level1);
7754 memset(level2, 0xFF, sizeof level2);
7755
7756 /* If there isn't a one-to-one mapping of NULL to \0,
7757 or if there are non-BMP characters, we need to use
7758 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007759 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007760 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007761 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007762 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007763 ch = PyUnicode_READ(kind, data, i);
7764 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007765 need_dict = 1;
7766 break;
7767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007768 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007769 /* unmapped character */
7770 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007771 l1 = ch >> 11;
7772 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007773 if (level1[l1] == 0xFF)
7774 level1[l1] = count2++;
7775 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007776 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007777 }
7778
7779 if (count2 >= 0xFF || count3 >= 0xFF)
7780 need_dict = 1;
7781
7782 if (need_dict) {
7783 PyObject *result = PyDict_New();
7784 PyObject *key, *value;
7785 if (!result)
7786 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007787 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007788 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007789 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007790 if (!key || !value)
7791 goto failed1;
7792 if (PyDict_SetItem(result, key, value) == -1)
7793 goto failed1;
7794 Py_DECREF(key);
7795 Py_DECREF(value);
7796 }
7797 return result;
7798 failed1:
7799 Py_XDECREF(key);
7800 Py_XDECREF(value);
7801 Py_DECREF(result);
7802 return NULL;
7803 }
7804
7805 /* Create a three-level trie */
7806 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7807 16*count2 + 128*count3 - 1);
7808 if (!result)
7809 return PyErr_NoMemory();
7810 PyObject_Init(result, &EncodingMapType);
7811 mresult = (struct encoding_map*)result;
7812 mresult->count2 = count2;
7813 mresult->count3 = count3;
7814 mlevel1 = mresult->level1;
7815 mlevel2 = mresult->level23;
7816 mlevel3 = mresult->level23 + 16*count2;
7817 memcpy(mlevel1, level1, 32);
7818 memset(mlevel2, 0xFF, 16*count2);
7819 memset(mlevel3, 0, 128*count3);
7820 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007821 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007822 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007823 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7824 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007825 /* unmapped character */
7826 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007827 o1 = ch>>11;
7828 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007829 i2 = 16*mlevel1[o1] + o2;
7830 if (mlevel2[i2] == 0xFF)
7831 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007832 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007833 i3 = 128*mlevel2[i2] + o3;
7834 mlevel3[i3] = i;
7835 }
7836 return result;
7837}
7838
7839static int
Victor Stinner22168992011-11-20 17:09:18 +01007840encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007841{
7842 struct encoding_map *map = (struct encoding_map*)mapping;
7843 int l1 = c>>11;
7844 int l2 = (c>>7) & 0xF;
7845 int l3 = c & 0x7F;
7846 int i;
7847
Victor Stinner22168992011-11-20 17:09:18 +01007848 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007850 if (c == 0)
7851 return 0;
7852 /* level 1*/
7853 i = map->level1[l1];
7854 if (i == 0xFF) {
7855 return -1;
7856 }
7857 /* level 2*/
7858 i = map->level23[16*i+l2];
7859 if (i == 0xFF) {
7860 return -1;
7861 }
7862 /* level 3 */
7863 i = map->level23[16*map->count2 + 128*i + l3];
7864 if (i == 0) {
7865 return -1;
7866 }
7867 return i;
7868}
7869
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007870/* Lookup the character ch in the mapping. If the character
7871 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007872 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007873static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007874charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875{
Christian Heimes217cfd12007-12-02 14:31:20 +00007876 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007877 PyObject *x;
7878
7879 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007881 x = PyObject_GetItem(mapping, w);
7882 Py_DECREF(w);
7883 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7885 /* No mapping found means: mapping is undefined. */
7886 PyErr_Clear();
7887 x = Py_None;
7888 Py_INCREF(x);
7889 return x;
7890 } else
7891 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007893 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007894 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007895 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 long value = PyLong_AS_LONG(x);
7897 if (value < 0 || value > 255) {
7898 PyErr_SetString(PyExc_TypeError,
7899 "character mapping must be in range(256)");
7900 Py_DECREF(x);
7901 return NULL;
7902 }
7903 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007905 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 /* wrong return value */
7909 PyErr_Format(PyExc_TypeError,
7910 "character mapping must return integer, bytes or None, not %.400s",
7911 x->ob_type->tp_name);
7912 Py_DECREF(x);
7913 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914 }
7915}
7916
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007917static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007918charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007919{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007920 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7921 /* exponentially overallocate to minimize reallocations */
7922 if (requiredsize < 2*outsize)
7923 requiredsize = 2*outsize;
7924 if (_PyBytes_Resize(outobj, requiredsize))
7925 return -1;
7926 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007927}
7928
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007932/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007933 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007934 space is available. Return a new reference to the object that
7935 was put in the output buffer, or Py_None, if the mapping was undefined
7936 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007937 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007938static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007939charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007940 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007941{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007942 PyObject *rep;
7943 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007944 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007945
Christian Heimes90aa7642007-12-19 02:45:37 +00007946 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007947 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007948 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007949 if (res == -1)
7950 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 if (outsize<requiredsize)
7952 if (charmapencode_resize(outobj, outpos, requiredsize))
7953 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007954 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 outstart[(*outpos)++] = (char)res;
7956 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007957 }
7958
7959 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007960 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007962 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 Py_DECREF(rep);
7964 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007965 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 if (PyLong_Check(rep)) {
7967 Py_ssize_t requiredsize = *outpos+1;
7968 if (outsize<requiredsize)
7969 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7970 Py_DECREF(rep);
7971 return enc_EXCEPTION;
7972 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007973 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007975 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 else {
7977 const char *repchars = PyBytes_AS_STRING(rep);
7978 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7979 Py_ssize_t requiredsize = *outpos+repsize;
7980 if (outsize<requiredsize)
7981 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7982 Py_DECREF(rep);
7983 return enc_EXCEPTION;
7984 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007985 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 memcpy(outstart + *outpos, repchars, repsize);
7987 *outpos += repsize;
7988 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007989 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007990 Py_DECREF(rep);
7991 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007992}
7993
/* Handle an unencodable character found by _PyUnicode_EncodeCharmap.

   On entry *inpos is the index of a character of unicode that mapping
   could not encode.  The function extends that position to the whole run
   of consecutive unencodable characters and then applies the error policy
   named by errors:
     strict            - raise the encode exception
     replace           - emit the encoding of '?' once per character
     ignore            - emit nothing
     xmlcharrefreplace - emit the encoding of "&#N;" for each character
     anything else     - call the registered error handler callback

   *known_errorHandler caches the parsed policy between calls (-1 means
   not parsed yet); *errorHandler and *exceptionObject are passed through
   to the helper functions so they can be reused.  Output goes to *res at
   *respos (both may be updated), and *inpos is advanced past the handled
   run (to the callback's newpos in the callback case).

   Return 0 on success, -1 on error */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    int *known_errorHandler, PyObject **errorHandler, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    char *encoding = "charmap";
    char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters: advance collendpos until the first
       character that does have a mapping (or the end of the string) */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            /* trie mapping: no Python objects involved */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            /* encodable character: the run ends here */
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*known_errorHandler==-1) {
        if ((errors==NULL) || (!strcmp(errors, "strict")))
            *known_errorHandler = 1;
        else if (!strcmp(errors, "replace"))
            *known_errorHandler = 2;
        else if (!strcmp(errors, "ignore"))
            *known_errorHandler = 3;
        else if (!strcmp(errors, "xmlcharrefreplace"))
            *known_errorHandler = 4;
        else
            *known_errorHandler = 0;
    }
    switch (*known_errorHandler) {
    case 1: /* strict */
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;
    case 2: /* replace */
        /* one '?' per unencodable character; '?' itself must be encodable */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case 3: /* ignore */
        *inpos = collendpos;
        break;
    case 4: /* xmlcharrefreplace */
        /* generate the "&#N;" replacement for each character and push it
           through the mapping one character at a time */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            /* "&#" + decimal digits + ";" + terminating NUL */
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;
    default:
        /* user-registered error handler */
        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: the handler returned a str, which is itself
           encoded through the mapping character by character */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                /* the replacement text is not encodable either */
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8147
Alexander Belopolsky40018472011-02-26 01:02:56 +00008148PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008149_PyUnicode_EncodeCharmap(PyObject *unicode,
8150 PyObject *mapping,
8151 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008153 /* output object */
8154 PyObject *res = NULL;
8155 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008156 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008157 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008158 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008159 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008160 PyObject *errorHandler = NULL;
8161 PyObject *exc = NULL;
8162 /* the following variable is used for caching string comparisons
8163 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8164 * 3=ignore, 4=xmlcharrefreplace */
8165 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166
Benjamin Petersonbac79492012-01-14 13:34:47 -05008167 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008168 return NULL;
8169 size = PyUnicode_GET_LENGTH(unicode);
8170
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171 /* Default to Latin-1 */
8172 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008173 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008175 /* allocate enough for a simple encoding without
8176 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008177 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008178 if (res == NULL)
8179 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008180 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008183 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008184 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008186 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 if (x==enc_EXCEPTION) /* error */
8188 goto onError;
8189 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008190 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 &exc,
8192 &known_errorHandler, &errorHandler, errors,
8193 &res, &respos)) {
8194 goto onError;
8195 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008196 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 else
8198 /* done with this character => adjust input position */
8199 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008202 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008203 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008204 if (_PyBytes_Resize(&res, respos) < 0)
8205 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008206
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008207 Py_XDECREF(exc);
8208 Py_XDECREF(errorHandler);
8209 return res;
8210
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008212 Py_XDECREF(res);
8213 Py_XDECREF(exc);
8214 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215 return NULL;
8216}
8217
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008218/* Deprecated */
8219PyObject *
8220PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8221 Py_ssize_t size,
8222 PyObject *mapping,
8223 const char *errors)
8224{
8225 PyObject *result;
8226 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8227 if (unicode == NULL)
8228 return NULL;
8229 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8230 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008231 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008232}
8233
Alexander Belopolsky40018472011-02-26 01:02:56 +00008234PyObject *
8235PyUnicode_AsCharmapString(PyObject *unicode,
8236 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237{
8238 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 PyErr_BadArgument();
8240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008242 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243}
8244
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008245/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008246static void
8247make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008249 Py_ssize_t startpos, Py_ssize_t endpos,
8250 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008253 *exceptionObject = _PyUnicodeTranslateError_Create(
8254 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 }
8256 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8258 goto onError;
8259 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8260 goto onError;
8261 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8262 goto onError;
8263 return;
8264 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008265 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266 }
8267}
8268
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008270static void
8271raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008273 Py_ssize_t startpos, Py_ssize_t endpos,
8274 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275{
8276 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008277 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280}
8281
8282/* error handling callback helper:
8283 build arguments, call the callback and check the arguments,
8284 put the result into newpos and return the replacement string, which
8285 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008286static PyObject *
8287unicode_translate_call_errorhandler(const char *errors,
8288 PyObject **errorHandler,
8289 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008290 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008291 Py_ssize_t startpos, Py_ssize_t endpos,
8292 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008293{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008294 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008295
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008296 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297 PyObject *restuple;
8298 PyObject *resunicode;
8299
8300 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304 }
8305
8306 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008307 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008308 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008310
8311 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008313 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008315 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008316 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 Py_DECREF(restuple);
8318 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008319 }
8320 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 &resunicode, &i_newpos)) {
8322 Py_DECREF(restuple);
8323 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008325 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008326 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008327 else
8328 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8331 Py_DECREF(restuple);
8332 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008333 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334 Py_INCREF(resunicode);
8335 Py_DECREF(restuple);
8336 return resunicode;
8337}
8338
8339/* Lookup the character ch in the mapping and put the result in result,
8340 which must be decrefed by the caller.
8341 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008342static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008343charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344{
Christian Heimes217cfd12007-12-02 14:31:20 +00008345 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008346 PyObject *x;
8347
8348 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 x = PyObject_GetItem(mapping, w);
8351 Py_DECREF(w);
8352 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8354 /* No mapping found means: use 1:1 mapping. */
8355 PyErr_Clear();
8356 *result = NULL;
8357 return 0;
8358 } else
8359 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360 }
8361 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 *result = x;
8363 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008364 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008365 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 long value = PyLong_AS_LONG(x);
8367 long max = PyUnicode_GetMax();
8368 if (value < 0 || value > max) {
8369 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008370 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 Py_DECREF(x);
8372 return -1;
8373 }
8374 *result = x;
8375 return 0;
8376 }
8377 else if (PyUnicode_Check(x)) {
8378 *result = x;
8379 return 0;
8380 }
8381 else {
8382 /* wrong return value */
8383 PyErr_SetString(PyExc_TypeError,
8384 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008385 Py_DECREF(x);
8386 return -1;
8387 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388}
8389/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 if not reallocate and adjust various state variables.
8391 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008392static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008397 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008398 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 /* exponentially overallocate to minimize reallocations */
8400 if (requiredsize < 2 * oldsize)
8401 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008402 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8403 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008405 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008406 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 }
8408 return 0;
8409}
8410/* lookup the character, put the result in the output string and adjust
8411 various state variables. Return a new reference to the object that
8412 was put in the output buffer in *result, or Py_None, if the mapping was
8413 undefined (in which case no character was written).
8414 The called must decref result.
8415 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008416static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8418 PyObject *mapping, Py_UCS4 **output,
8419 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008420 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8423 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428 }
8429 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008431 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 }
8435 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008436 Py_ssize_t repsize;
8437 if (PyUnicode_READY(*res) == -1)
8438 return -1;
8439 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 if (repsize==1) {
8441 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008442 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 }
8444 else if (repsize!=0) {
8445 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008446 Py_ssize_t requiredsize = *opos +
8447 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 Py_ssize_t i;
8450 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 for(i = 0; i < repsize; i++)
8453 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008455 }
8456 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008458 return 0;
8459}
8460
Alexander Belopolsky40018472011-02-26 01:02:56 +00008461PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462_PyUnicode_TranslateCharmap(PyObject *input,
8463 PyObject *mapping,
8464 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 /* input object */
8467 char *idata;
8468 Py_ssize_t size, i;
8469 int kind;
8470 /* output buffer */
8471 Py_UCS4 *output = NULL;
8472 Py_ssize_t osize;
8473 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008474 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008475 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476 char *reason = "character maps to <undefined>";
8477 PyObject *errorHandler = NULL;
8478 PyObject *exc = NULL;
8479 /* the following variable is used for caching string comparisons
8480 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8481 * 3=ignore, 4=xmlcharrefreplace */
8482 int known_errorHandler = -1;
8483
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 PyErr_BadArgument();
8486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 if (PyUnicode_READY(input) == -1)
8490 return NULL;
8491 idata = (char*)PyUnicode_DATA(input);
8492 kind = PyUnicode_KIND(input);
8493 size = PyUnicode_GET_LENGTH(input);
8494 i = 0;
8495
8496 if (size == 0) {
8497 Py_INCREF(input);
8498 return input;
8499 }
8500
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008501 /* allocate enough for a simple 1:1 translation without
8502 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 osize = size;
8504 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8505 opos = 0;
8506 if (output == NULL) {
8507 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 /* try to encode it */
8513 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 if (charmaptranslate_output(input, i, mapping,
8515 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 Py_XDECREF(x);
8517 goto onError;
8518 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008519 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 else { /* untranslatable character */
8523 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8524 Py_ssize_t repsize;
8525 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 Py_ssize_t collstart = i;
8529 Py_ssize_t collend = i+1;
8530 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533 while (collend < size) {
8534 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 goto onError;
8536 Py_XDECREF(x);
8537 if (x!=Py_None)
8538 break;
8539 ++collend;
8540 }
8541 /* cache callback name lookup
8542 * (if not done yet, i.e. it's the first error) */
8543 if (known_errorHandler==-1) {
8544 if ((errors==NULL) || (!strcmp(errors, "strict")))
8545 known_errorHandler = 1;
8546 else if (!strcmp(errors, "replace"))
8547 known_errorHandler = 2;
8548 else if (!strcmp(errors, "ignore"))
8549 known_errorHandler = 3;
8550 else if (!strcmp(errors, "xmlcharrefreplace"))
8551 known_errorHandler = 4;
8552 else
8553 known_errorHandler = 0;
8554 }
8555 switch (known_errorHandler) {
8556 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 raise_translate_exception(&exc, input, collstart,
8558 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008559 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 case 2: /* replace */
8561 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 for (coll = collstart; coll<collend; coll++)
8563 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 /* fall through */
8565 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 break;
8568 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569 /* generate replacement (temporarily (mis)uses i) */
8570 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 char buffer[2+29+1+1];
8572 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8574 if (charmaptranslate_makespace(&output, &osize,
8575 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 goto onError;
8577 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 break;
8582 default:
8583 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 reason, input, &exc,
8585 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008586 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008588 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008589 Py_DECREF(repunicode);
8590 goto onError;
8591 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 repsize = PyUnicode_GET_LENGTH(repunicode);
8594 if (charmaptranslate_makespace(&output, &osize,
8595 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 Py_DECREF(repunicode);
8597 goto onError;
8598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599 for (uni2 = 0; repsize-->0; ++uni2)
8600 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8601 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008603 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008604 }
8605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8607 if (!res)
8608 goto onError;
8609 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008610 Py_XDECREF(exc);
8611 Py_XDECREF(errorHandler);
8612 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616 Py_XDECREF(exc);
8617 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 return NULL;
8619}
8620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621/* Deprecated. Use PyUnicode_Translate instead. */
8622PyObject *
8623PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8624 Py_ssize_t size,
8625 PyObject *mapping,
8626 const char *errors)
8627{
Christian Heimes5f520f42012-09-11 14:03:25 +02008628 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008629 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8630 if (!unicode)
8631 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008632 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8633 Py_DECREF(unicode);
8634 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635}
8636
Alexander Belopolsky40018472011-02-26 01:02:56 +00008637PyObject *
8638PyUnicode_Translate(PyObject *str,
8639 PyObject *mapping,
8640 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641{
8642 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008643
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644 str = PyUnicode_FromObject(str);
8645 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008646 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 Py_DECREF(str);
8649 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650}
Tim Petersced69f82003-09-16 20:30:58 +00008651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008653fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654{
8655 /* No need to call PyUnicode_READY(self) because this function is only
8656 called as a callback from fixup() which does it already. */
8657 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8658 const int kind = PyUnicode_KIND(self);
8659 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008660 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008661 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 Py_ssize_t i;
8663
8664 for (i = 0; i < len; ++i) {
8665 ch = PyUnicode_READ(kind, data, i);
8666 fixed = 0;
8667 if (ch > 127) {
8668 if (Py_UNICODE_ISSPACE(ch))
8669 fixed = ' ';
8670 else {
8671 const int decimal = Py_UNICODE_TODECIMAL(ch);
8672 if (decimal >= 0)
8673 fixed = '0' + decimal;
8674 }
8675 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008676 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008677 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678 PyUnicode_WRITE(kind, data, i, fixed);
8679 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008680 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008681 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008683 }
8684
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008685 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686}
8687
8688PyObject *
8689_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8690{
8691 if (!PyUnicode_Check(unicode)) {
8692 PyErr_BadInternalCall();
8693 return NULL;
8694 }
8695 if (PyUnicode_READY(unicode) == -1)
8696 return NULL;
8697 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8698 /* If the string is already ASCII, just return the same string */
8699 Py_INCREF(unicode);
8700 return unicode;
8701 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008702 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703}
8704
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008705PyObject *
8706PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8707 Py_ssize_t length)
8708{
Victor Stinnerf0124502011-11-21 23:12:56 +01008709 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008710 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008711 Py_UCS4 maxchar;
8712 enum PyUnicode_Kind kind;
8713 void *data;
8714
Victor Stinner99d7ad02012-02-22 13:37:39 +01008715 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008716 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008717 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008718 if (ch > 127) {
8719 int decimal = Py_UNICODE_TODECIMAL(ch);
8720 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008721 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008722 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008723 }
8724 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008725
8726 /* Copy to a new string */
8727 decimal = PyUnicode_New(length, maxchar);
8728 if (decimal == NULL)
8729 return decimal;
8730 kind = PyUnicode_KIND(decimal);
8731 data = PyUnicode_DATA(decimal);
8732 /* Iterate over code points */
8733 for (i = 0; i < length; i++) {
8734 Py_UNICODE ch = s[i];
8735 if (ch > 127) {
8736 int decimal = Py_UNICODE_TODECIMAL(ch);
8737 if (decimal >= 0)
8738 ch = '0' + decimal;
8739 }
8740 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008742 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008743}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008744/* --- Decimal Encoder ---------------------------------------------------- */
8745
Alexander Belopolsky40018472011-02-26 01:02:56 +00008746int
8747PyUnicode_EncodeDecimal(Py_UNICODE *s,
8748 Py_ssize_t length,
8749 char *output,
8750 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008751{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008752 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008753 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008754 enum PyUnicode_Kind kind;
8755 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008756
8757 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 PyErr_BadArgument();
8759 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008760 }
8761
Victor Stinner42bf7752011-11-21 22:52:58 +01008762 unicode = PyUnicode_FromUnicode(s, length);
8763 if (unicode == NULL)
8764 return -1;
8765
Benjamin Petersonbac79492012-01-14 13:34:47 -05008766 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008767 Py_DECREF(unicode);
8768 return -1;
8769 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008770 kind = PyUnicode_KIND(unicode);
8771 data = PyUnicode_DATA(unicode);
8772
Victor Stinnerb84d7232011-11-22 01:50:07 +01008773 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008774 PyObject *exc;
8775 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008777 Py_ssize_t startpos;
8778
8779 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008780
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008782 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008783 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008785 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008786 decimal = Py_UNICODE_TODECIMAL(ch);
8787 if (decimal >= 0) {
8788 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008789 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 continue;
8791 }
8792 if (0 < ch && ch < 256) {
8793 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008794 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 continue;
8796 }
Victor Stinner6345be92011-11-25 20:09:01 +01008797
Victor Stinner42bf7752011-11-21 22:52:58 +01008798 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008799 exc = NULL;
8800 raise_encode_exception(&exc, "decimal", unicode,
8801 startpos, startpos+1,
8802 "invalid decimal Unicode string");
8803 Py_XDECREF(exc);
8804 Py_DECREF(unicode);
8805 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008806 }
8807 /* 0-terminate the output string */
8808 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008809 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008810 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008811}
8812
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813/* --- Helpers ------------------------------------------------------------ */
8814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008816any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 Py_ssize_t start,
8818 Py_ssize_t end)
8819{
8820 int kind1, kind2, kind;
8821 void *buf1, *buf2;
8822 Py_ssize_t len1, len2, result;
8823
8824 kind1 = PyUnicode_KIND(s1);
8825 kind2 = PyUnicode_KIND(s2);
8826 kind = kind1 > kind2 ? kind1 : kind2;
8827 buf1 = PyUnicode_DATA(s1);
8828 buf2 = PyUnicode_DATA(s2);
8829 if (kind1 != kind)
8830 buf1 = _PyUnicode_AsKind(s1, kind);
8831 if (!buf1)
8832 return -2;
8833 if (kind2 != kind)
8834 buf2 = _PyUnicode_AsKind(s2, kind);
8835 if (!buf2) {
8836 if (kind1 != kind) PyMem_Free(buf1);
8837 return -2;
8838 }
8839 len1 = PyUnicode_GET_LENGTH(s1);
8840 len2 = PyUnicode_GET_LENGTH(s2);
8841
Victor Stinner794d5672011-10-10 03:21:36 +02008842 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008843 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008844 case PyUnicode_1BYTE_KIND:
8845 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8846 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8847 else
8848 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8849 break;
8850 case PyUnicode_2BYTE_KIND:
8851 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8852 break;
8853 case PyUnicode_4BYTE_KIND:
8854 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8855 break;
8856 default:
8857 assert(0); result = -2;
8858 }
8859 }
8860 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008861 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008862 case PyUnicode_1BYTE_KIND:
8863 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8864 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8865 else
8866 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8867 break;
8868 case PyUnicode_2BYTE_KIND:
8869 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8870 break;
8871 case PyUnicode_4BYTE_KIND:
8872 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8873 break;
8874 default:
8875 assert(0); result = -2;
8876 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008877 }
8878
8879 if (kind1 != kind)
8880 PyMem_Free(buf1);
8881 if (kind2 != kind)
8882 PyMem_Free(buf2);
8883
8884 return result;
8885}
8886
8887Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008888_PyUnicode_InsertThousandsGrouping(
8889 PyObject *unicode, Py_ssize_t index,
8890 Py_ssize_t n_buffer,
8891 void *digits, Py_ssize_t n_digits,
8892 Py_ssize_t min_width,
8893 const char *grouping, PyObject *thousands_sep,
8894 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895{
Victor Stinner41a863c2012-02-24 00:37:51 +01008896 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008897 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008898 Py_ssize_t thousands_sep_len;
8899 Py_ssize_t len;
8900
8901 if (unicode != NULL) {
8902 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008903 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008904 }
8905 else {
8906 kind = PyUnicode_1BYTE_KIND;
8907 data = NULL;
8908 }
8909 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8910 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8911 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8912 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008913 if (thousands_sep_kind < kind) {
8914 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8915 if (!thousands_sep_data)
8916 return -1;
8917 }
8918 else {
8919 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8920 if (!data)
8921 return -1;
8922 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008923 }
8924
Benjamin Petersonead6b532011-12-20 17:23:42 -06008925 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008927 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008928 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008929 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008930 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008931 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008932 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008933 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008934 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008935 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008936 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008937 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008939 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008940 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008941 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008942 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008943 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008945 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008946 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008947 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008948 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008949 break;
8950 default:
8951 assert(0);
8952 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008954 if (unicode != NULL && thousands_sep_kind != kind) {
8955 if (thousands_sep_kind < kind)
8956 PyMem_Free(thousands_sep_data);
8957 else
8958 PyMem_Free(data);
8959 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008960 if (unicode == NULL) {
8961 *maxchar = 127;
8962 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07008963 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02008964 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008965 }
8966 }
8967 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968}
8969
8970
/* helper macro to fixup start/end slice values:
   - end is clipped to len; a negative end is taken relative to len and
     then clamped to 0
   - a negative start is taken relative to len and then clamped to 0
   start and end must be modifiable lvalues; they are evaluated more than
   once.  The body is wrapped in do { } while (0) so that the macro acts as
   a single statement (safe under an unbraced if/else); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008985
Alexander Belopolsky40018472011-02-26 01:02:56 +00008986Py_ssize_t
8987PyUnicode_Count(PyObject *str,
8988 PyObject *substr,
8989 Py_ssize_t start,
8990 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008992 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008993 PyObject* str_obj;
8994 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995 int kind1, kind2, kind;
8996 void *buf1 = NULL, *buf2 = NULL;
8997 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008998
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008999 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009000 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009002 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009003 if (!sub_obj) {
9004 Py_DECREF(str_obj);
9005 return -1;
9006 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009007 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009008 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 Py_DECREF(str_obj);
9010 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 }
Tim Petersced69f82003-09-16 20:30:58 +00009012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 kind1 = PyUnicode_KIND(str_obj);
9014 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009015 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009018 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009019 if (kind2 > kind) {
9020 Py_DECREF(sub_obj);
9021 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009022 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009023 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009024 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 if (!buf2)
9027 goto onError;
9028 len1 = PyUnicode_GET_LENGTH(str_obj);
9029 len2 = PyUnicode_GET_LENGTH(sub_obj);
9030
9031 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009032 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009033 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009034 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9035 result = asciilib_count(
9036 ((Py_UCS1*)buf1) + start, end - start,
9037 buf2, len2, PY_SSIZE_T_MAX
9038 );
9039 else
9040 result = ucs1lib_count(
9041 ((Py_UCS1*)buf1) + start, end - start,
9042 buf2, len2, PY_SSIZE_T_MAX
9043 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 break;
9045 case PyUnicode_2BYTE_KIND:
9046 result = ucs2lib_count(
9047 ((Py_UCS2*)buf1) + start, end - start,
9048 buf2, len2, PY_SSIZE_T_MAX
9049 );
9050 break;
9051 case PyUnicode_4BYTE_KIND:
9052 result = ucs4lib_count(
9053 ((Py_UCS4*)buf1) + start, end - start,
9054 buf2, len2, PY_SSIZE_T_MAX
9055 );
9056 break;
9057 default:
9058 assert(0); result = 0;
9059 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009060
9061 Py_DECREF(sub_obj);
9062 Py_DECREF(str_obj);
9063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 if (kind2 != kind)
9065 PyMem_Free(buf2);
9066
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 onError:
9069 Py_DECREF(sub_obj);
9070 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071 if (kind2 != kind && buf2)
9072 PyMem_Free(buf2);
9073 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074}
9075
Alexander Belopolsky40018472011-02-26 01:02:56 +00009076Py_ssize_t
9077PyUnicode_Find(PyObject *str,
9078 PyObject *sub,
9079 Py_ssize_t start,
9080 Py_ssize_t end,
9081 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009083 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009084
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009086 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009087 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009088 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009089 if (!sub) {
9090 Py_DECREF(str);
9091 return -2;
9092 }
9093 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9094 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009095 Py_DECREF(str);
9096 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097 }
Tim Petersced69f82003-09-16 20:30:58 +00009098
Victor Stinner794d5672011-10-10 03:21:36 +02009099 result = any_find_slice(direction,
9100 str, sub, start, end
9101 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009102
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009104 Py_DECREF(sub);
9105
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106 return result;
9107}
9108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109Py_ssize_t
9110PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9111 Py_ssize_t start, Py_ssize_t end,
9112 int direction)
9113{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009115 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116 if (PyUnicode_READY(str) == -1)
9117 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009118 if (start < 0 || end < 0) {
9119 PyErr_SetString(PyExc_IndexError, "string index out of range");
9120 return -2;
9121 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009122 if (end > PyUnicode_GET_LENGTH(str))
9123 end = PyUnicode_GET_LENGTH(str);
9124 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009125 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9126 kind, end-start, ch, direction);
9127 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009129 else
9130 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131}
9132
Alexander Belopolsky40018472011-02-26 01:02:56 +00009133static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009134tailmatch(PyObject *self,
9135 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009136 Py_ssize_t start,
9137 Py_ssize_t end,
9138 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009140 int kind_self;
9141 int kind_sub;
9142 void *data_self;
9143 void *data_sub;
9144 Py_ssize_t offset;
9145 Py_ssize_t i;
9146 Py_ssize_t end_sub;
9147
9148 if (PyUnicode_READY(self) == -1 ||
9149 PyUnicode_READY(substring) == -1)
9150 return 0;
9151
9152 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153 return 1;
9154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9156 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009158 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009160 kind_self = PyUnicode_KIND(self);
9161 data_self = PyUnicode_DATA(self);
9162 kind_sub = PyUnicode_KIND(substring);
9163 data_sub = PyUnicode_DATA(substring);
9164 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9165
9166 if (direction > 0)
9167 offset = end;
9168 else
9169 offset = start;
9170
9171 if (PyUnicode_READ(kind_self, data_self, offset) ==
9172 PyUnicode_READ(kind_sub, data_sub, 0) &&
9173 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9174 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9175 /* If both are of the same kind, memcmp is sufficient */
9176 if (kind_self == kind_sub) {
9177 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009178 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 data_sub,
9180 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009181 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 }
9183 /* otherwise we have to compare each character by first accesing it */
9184 else {
9185 /* We do not need to compare 0 and len(substring)-1 because
9186 the if statement above ensured already that they are equal
9187 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02009188 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 for (i = 1; i < end_sub; ++i) {
9190 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9191 PyUnicode_READ(kind_sub, data_sub, i))
9192 return 0;
9193 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196 }
9197
9198 return 0;
9199}
9200
Alexander Belopolsky40018472011-02-26 01:02:56 +00009201Py_ssize_t
9202PyUnicode_Tailmatch(PyObject *str,
9203 PyObject *substr,
9204 Py_ssize_t start,
9205 Py_ssize_t end,
9206 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009208 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009209
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210 str = PyUnicode_FromObject(str);
9211 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009212 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213 substr = PyUnicode_FromObject(substr);
9214 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009215 Py_DECREF(str);
9216 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217 }
Tim Petersced69f82003-09-16 20:30:58 +00009218
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009219 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009220 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221 Py_DECREF(str);
9222 Py_DECREF(substr);
9223 return result;
9224}
9225
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226/* Apply fixfct filter to the Unicode object self and return a
9227 reference to the modified object */
9228
Alexander Belopolsky40018472011-02-26 01:02:56 +00009229static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009230fixup(PyObject *self,
9231 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233 PyObject *u;
9234 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009235 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009237 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009239 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009240 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 /* fix functions return the new maximum character in a string,
9243 if the kind of the resulting unicode object does not change,
9244 everything is fine. Otherwise we need to change the string kind
9245 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009246 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009247
9248 if (maxchar_new == 0) {
9249 /* no changes */;
9250 if (PyUnicode_CheckExact(self)) {
9251 Py_DECREF(u);
9252 Py_INCREF(self);
9253 return self;
9254 }
9255 else
9256 return u;
9257 }
9258
Victor Stinnere6abb482012-05-02 01:15:40 +02009259 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009260
Victor Stinnereaab6042011-12-11 22:22:39 +01009261 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009263
9264 /* In case the maximum character changed, we need to
9265 convert the string to the new category. */
9266 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9267 if (v == NULL) {
9268 Py_DECREF(u);
9269 return NULL;
9270 }
9271 if (maxchar_new > maxchar_old) {
9272 /* If the maxchar increased so that the kind changed, not all
9273 characters are representable anymore and we need to fix the
9274 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009275 _PyUnicode_FastCopyCharacters(v, 0,
9276 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009277 maxchar_old = fixfct(v);
9278 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 }
9280 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009281 _PyUnicode_FastCopyCharacters(v, 0,
9282 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009284 Py_DECREF(u);
9285 assert(_PyUnicode_CheckConsistency(v, 1));
9286 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287}
9288
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009289static PyObject *
9290ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009292 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9293 char *resdata, *data = PyUnicode_DATA(self);
9294 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009295
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009296 res = PyUnicode_New(len, 127);
9297 if (res == NULL)
9298 return NULL;
9299 resdata = PyUnicode_DATA(res);
9300 if (lower)
9301 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009303 _Py_bytes_upper(resdata, data, len);
9304 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305}
9306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009308handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009310 Py_ssize_t j;
9311 int final_sigma;
9312 Py_UCS4 c;
9313 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009314
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009315 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9316
9317 where ! is a negation and \p{xxx} is a character with property xxx.
9318 */
9319 for (j = i - 1; j >= 0; j--) {
9320 c = PyUnicode_READ(kind, data, j);
9321 if (!_PyUnicode_IsCaseIgnorable(c))
9322 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009323 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009324 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9325 if (final_sigma) {
9326 for (j = i + 1; j < length; j++) {
9327 c = PyUnicode_READ(kind, data, j);
9328 if (!_PyUnicode_IsCaseIgnorable(c))
9329 break;
9330 }
9331 final_sigma = j == length || !_PyUnicode_IsCased(c);
9332 }
9333 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009334}
9335
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009336static int
9337lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9338 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009340 /* Obscure special case. */
9341 if (c == 0x3A3) {
9342 mapped[0] = handle_capital_sigma(kind, data, length, i);
9343 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009345 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346}
9347
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009348static Py_ssize_t
9349do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009351 Py_ssize_t i, k = 0;
9352 int n_res, j;
9353 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009354
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009355 c = PyUnicode_READ(kind, data, 0);
9356 n_res = _PyUnicode_ToUpperFull(c, mapped);
9357 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009358 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009359 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009361 for (i = 1; i < length; i++) {
9362 c = PyUnicode_READ(kind, data, i);
9363 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9364 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009365 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009366 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009367 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009368 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009369 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370}
9371
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009372static Py_ssize_t
9373do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9374 Py_ssize_t i, k = 0;
9375
9376 for (i = 0; i < length; i++) {
9377 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9378 int n_res, j;
9379 if (Py_UNICODE_ISUPPER(c)) {
9380 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9381 }
9382 else if (Py_UNICODE_ISLOWER(c)) {
9383 n_res = _PyUnicode_ToUpperFull(c, mapped);
9384 }
9385 else {
9386 n_res = 1;
9387 mapped[0] = c;
9388 }
9389 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009390 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009391 res[k++] = mapped[j];
9392 }
9393 }
9394 return k;
9395}
9396
9397static Py_ssize_t
9398do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9399 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009401 Py_ssize_t i, k = 0;
9402
9403 for (i = 0; i < length; i++) {
9404 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9405 int n_res, j;
9406 if (lower)
9407 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9408 else
9409 n_res = _PyUnicode_ToUpperFull(c, mapped);
9410 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009411 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009412 res[k++] = mapped[j];
9413 }
9414 }
9415 return k;
9416}
9417
9418static Py_ssize_t
9419do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9420{
9421 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9422}
9423
9424static Py_ssize_t
9425do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9426{
9427 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9428}
9429
Benjamin Petersone51757f2012-01-12 21:10:29 -05009430static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009431do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9432{
9433 Py_ssize_t i, k = 0;
9434
9435 for (i = 0; i < length; i++) {
9436 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9437 Py_UCS4 mapped[3];
9438 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9439 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009440 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009441 res[k++] = mapped[j];
9442 }
9443 }
9444 return k;
9445}
9446
9447static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009448do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9449{
9450 Py_ssize_t i, k = 0;
9451 int previous_is_cased;
9452
9453 previous_is_cased = 0;
9454 for (i = 0; i < length; i++) {
9455 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9456 Py_UCS4 mapped[3];
9457 int n_res, j;
9458
9459 if (previous_is_cased)
9460 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9461 else
9462 n_res = _PyUnicode_ToTitleFull(c, mapped);
9463
9464 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009465 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009466 res[k++] = mapped[j];
9467 }
9468
9469 previous_is_cased = _PyUnicode_IsCased(c);
9470 }
9471 return k;
9472}
9473
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009474static PyObject *
9475case_operation(PyObject *self,
9476 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9477{
9478 PyObject *res = NULL;
9479 Py_ssize_t length, newlength = 0;
9480 int kind, outkind;
9481 void *data, *outdata;
9482 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9483
Benjamin Petersoneea48462012-01-16 14:28:50 -05009484 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009485
9486 kind = PyUnicode_KIND(self);
9487 data = PyUnicode_DATA(self);
9488 length = PyUnicode_GET_LENGTH(self);
Antoine Pitroub6dc9b72014-10-15 23:14:53 +02009489 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009490 PyErr_SetString(PyExc_OverflowError, "string is too long");
9491 return NULL;
9492 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009493 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009494 if (tmp == NULL)
9495 return PyErr_NoMemory();
9496 newlength = perform(kind, data, length, tmp, &maxchar);
9497 res = PyUnicode_New(newlength, maxchar);
9498 if (res == NULL)
9499 goto leave;
9500 tmpend = tmp + newlength;
9501 outdata = PyUnicode_DATA(res);
9502 outkind = PyUnicode_KIND(res);
9503 switch (outkind) {
9504 case PyUnicode_1BYTE_KIND:
9505 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9506 break;
9507 case PyUnicode_2BYTE_KIND:
9508 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9509 break;
9510 case PyUnicode_4BYTE_KIND:
9511 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9512 break;
9513 default:
9514 assert(0);
9515 break;
9516 }
9517 leave:
9518 PyMem_FREE(tmp);
9519 return res;
9520}
9521
Tim Peters8ce9f162004-08-27 01:49:32 +00009522PyObject *
9523PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009525 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009526 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009528 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009529 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9530 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009531 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009533 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009535 int use_memcpy;
9536 unsigned char *res_data = NULL, *sep_data = NULL;
9537 PyObject *last_obj;
9538 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009540 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009541 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009542 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009543 }
9544
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009545 /* NOTE: the following code can't call back into Python code,
9546 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009547 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009548
Tim Peters05eba1f2004-08-27 21:32:02 +00009549 seqlen = PySequence_Fast_GET_SIZE(fseq);
9550 /* If empty sequence, return u"". */
9551 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009552 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009553 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009554 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009555
Tim Peters05eba1f2004-08-27 21:32:02 +00009556 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009557 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009558 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009559 if (seqlen == 1) {
9560 if (PyUnicode_CheckExact(items[0])) {
9561 res = items[0];
9562 Py_INCREF(res);
9563 Py_DECREF(fseq);
9564 return res;
9565 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009566 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009567 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009568 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009569 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009570 /* Set up sep and seplen */
9571 if (separator == NULL) {
9572 /* fall back to a blank space separator */
9573 sep = PyUnicode_FromOrdinal(' ');
9574 if (!sep)
9575 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009576 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009577 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009578 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009579 else {
9580 if (!PyUnicode_Check(separator)) {
9581 PyErr_Format(PyExc_TypeError,
9582 "separator: expected str instance,"
9583 " %.80s found",
9584 Py_TYPE(separator)->tp_name);
9585 goto onError;
9586 }
9587 if (PyUnicode_READY(separator))
9588 goto onError;
9589 sep = separator;
9590 seplen = PyUnicode_GET_LENGTH(separator);
9591 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9592 /* inc refcount to keep this code path symmetric with the
9593 above case of a blank separator */
9594 Py_INCREF(sep);
9595 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009596 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009597 }
9598
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009599 /* There are at least two things to join, or else we have a subclass
9600 * of str in the sequence.
9601 * Do a pre-pass to figure out the total amount of space we'll
9602 * need (sz), and see whether all argument are strings.
9603 */
9604 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009605#ifdef Py_DEBUG
9606 use_memcpy = 0;
9607#else
9608 use_memcpy = 1;
9609#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009610 for (i = 0; i < seqlen; i++) {
9611 const Py_ssize_t old_sz = sz;
9612 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009613 if (!PyUnicode_Check(item)) {
9614 PyErr_Format(PyExc_TypeError,
9615 "sequence item %zd: expected str instance,"
9616 " %.80s found",
9617 i, Py_TYPE(item)->tp_name);
9618 goto onError;
9619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 if (PyUnicode_READY(item) == -1)
9621 goto onError;
9622 sz += PyUnicode_GET_LENGTH(item);
9623 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009624 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009625 if (i != 0)
9626 sz += seplen;
9627 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9628 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009629 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009630 goto onError;
9631 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009632 if (use_memcpy && last_obj != NULL) {
9633 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9634 use_memcpy = 0;
9635 }
9636 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009637 }
Tim Petersced69f82003-09-16 20:30:58 +00009638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009639 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009640 if (res == NULL)
9641 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009642
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009643 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009644#ifdef Py_DEBUG
9645 use_memcpy = 0;
9646#else
9647 if (use_memcpy) {
9648 res_data = PyUnicode_1BYTE_DATA(res);
9649 kind = PyUnicode_KIND(res);
9650 if (seplen != 0)
9651 sep_data = PyUnicode_1BYTE_DATA(sep);
9652 }
9653#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009655 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009656 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009657 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009658 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009659 if (use_memcpy) {
9660 Py_MEMCPY(res_data,
9661 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009662 kind * seplen);
9663 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009664 }
9665 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009666 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009667 res_offset += seplen;
9668 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009669 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009670 itemlen = PyUnicode_GET_LENGTH(item);
9671 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009672 if (use_memcpy) {
9673 Py_MEMCPY(res_data,
9674 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009675 kind * itemlen);
9676 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009677 }
9678 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009679 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009680 res_offset += itemlen;
9681 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009682 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009683 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009684 if (use_memcpy)
9685 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009686 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009687 else
9688 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009689
Tim Peters05eba1f2004-08-27 21:32:02 +00009690 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009692 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694
Benjamin Peterson29060642009-01-31 22:14:21 +00009695 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009696 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009698 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699 return NULL;
9700}
9701
/* Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   character width and must be one of the 1-, 2- or 4-byte kinds.

   `kind`, `value`, `start` and `length` may be evaluated more than once,
   so pass expressions without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9725
Victor Stinnerd3f08822012-05-29 12:57:52 +02009726void
9727_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9728 Py_UCS4 fill_char)
9729{
9730 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9731 const void *data = PyUnicode_DATA(unicode);
9732 assert(PyUnicode_IS_READY(unicode));
9733 assert(unicode_modifiable(unicode));
9734 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9735 assert(start >= 0);
9736 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9737 FILL(kind, data, fill_char, start, length);
9738}
9739
Victor Stinner3fe55312012-01-04 00:33:50 +01009740Py_ssize_t
9741PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9742 Py_UCS4 fill_char)
9743{
9744 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009745
9746 if (!PyUnicode_Check(unicode)) {
9747 PyErr_BadInternalCall();
9748 return -1;
9749 }
9750 if (PyUnicode_READY(unicode) == -1)
9751 return -1;
9752 if (unicode_check_modifiable(unicode))
9753 return -1;
9754
Victor Stinnerd3f08822012-05-29 12:57:52 +02009755 if (start < 0) {
9756 PyErr_SetString(PyExc_IndexError, "string index out of range");
9757 return -1;
9758 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009759 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9760 PyErr_SetString(PyExc_ValueError,
9761 "fill character is bigger than "
9762 "the string maximum character");
9763 return -1;
9764 }
9765
9766 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9767 length = Py_MIN(maxlen, length);
9768 if (length <= 0)
9769 return 0;
9770
Victor Stinnerd3f08822012-05-29 12:57:52 +02009771 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009772 return length;
9773}
9774
Victor Stinner9310abb2011-10-05 00:59:23 +02009775static PyObject *
9776pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009777 Py_ssize_t left,
9778 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 PyObject *u;
9782 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009783 int kind;
9784 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785
9786 if (left < 0)
9787 left = 0;
9788 if (right < 0)
9789 right = 0;
9790
Victor Stinnerc4b49542011-12-11 22:44:26 +01009791 if (left == 0 && right == 0)
9792 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9795 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009796 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9797 return NULL;
9798 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009800 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009802 if (!u)
9803 return NULL;
9804
9805 kind = PyUnicode_KIND(u);
9806 data = PyUnicode_DATA(u);
9807 if (left)
9808 FILL(kind, data, fill, 0, left);
9809 if (right)
9810 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009811 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009812 assert(_PyUnicode_CheckConsistency(u, 1));
9813 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009814}
9815
Alexander Belopolsky40018472011-02-26 01:02:56 +00009816PyObject *
9817PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009820
9821 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009822 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009823 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009824 if (PyUnicode_READY(string) == -1) {
9825 Py_DECREF(string);
9826 return NULL;
9827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828
Benjamin Petersonead6b532011-12-20 17:23:42 -06009829 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009831 if (PyUnicode_IS_ASCII(string))
9832 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009833 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009834 PyUnicode_GET_LENGTH(string), keepends);
9835 else
9836 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009837 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009838 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 break;
9840 case PyUnicode_2BYTE_KIND:
9841 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009842 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 PyUnicode_GET_LENGTH(string), keepends);
9844 break;
9845 case PyUnicode_4BYTE_KIND:
9846 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009847 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009848 PyUnicode_GET_LENGTH(string), keepends);
9849 break;
9850 default:
9851 assert(0);
9852 list = 0;
9853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854 Py_DECREF(string);
9855 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856}
9857
Alexander Belopolsky40018472011-02-26 01:02:56 +00009858static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009859split(PyObject *self,
9860 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009861 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009862{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 int kind1, kind2, kind;
9864 void *buf1, *buf2;
9865 Py_ssize_t len1, len2;
9866 PyObject* out;
9867
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009869 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 if (PyUnicode_READY(self) == -1)
9872 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009874 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009875 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009877 if (PyUnicode_IS_ASCII(self))
9878 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009879 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009880 PyUnicode_GET_LENGTH(self), maxcount
9881 );
9882 else
9883 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009884 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009885 PyUnicode_GET_LENGTH(self), maxcount
9886 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 case PyUnicode_2BYTE_KIND:
9888 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009889 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009890 PyUnicode_GET_LENGTH(self), maxcount
9891 );
9892 case PyUnicode_4BYTE_KIND:
9893 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009894 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 PyUnicode_GET_LENGTH(self), maxcount
9896 );
9897 default:
9898 assert(0);
9899 return NULL;
9900 }
9901
9902 if (PyUnicode_READY(substring) == -1)
9903 return NULL;
9904
9905 kind1 = PyUnicode_KIND(self);
9906 kind2 = PyUnicode_KIND(substring);
9907 kind = kind1 > kind2 ? kind1 : kind2;
9908 buf1 = PyUnicode_DATA(self);
9909 buf2 = PyUnicode_DATA(substring);
9910 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009911 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 if (!buf1)
9913 return NULL;
9914 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009915 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 if (!buf2) {
9917 if (kind1 != kind) PyMem_Free(buf1);
9918 return NULL;
9919 }
9920 len1 = PyUnicode_GET_LENGTH(self);
9921 len2 = PyUnicode_GET_LENGTH(substring);
9922
Benjamin Petersonead6b532011-12-20 17:23:42 -06009923 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009925 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9926 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009927 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009928 else
9929 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009930 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 break;
9932 case PyUnicode_2BYTE_KIND:
9933 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009934 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 break;
9936 case PyUnicode_4BYTE_KIND:
9937 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009938 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 break;
9940 default:
9941 out = NULL;
9942 }
9943 if (kind1 != kind)
9944 PyMem_Free(buf1);
9945 if (kind2 != kind)
9946 PyMem_Free(buf2);
9947 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948}
9949
Alexander Belopolsky40018472011-02-26 01:02:56 +00009950static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009951rsplit(PyObject *self,
9952 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009953 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009954{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 int kind1, kind2, kind;
9956 void *buf1, *buf2;
9957 Py_ssize_t len1, len2;
9958 PyObject* out;
9959
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009960 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009961 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 if (PyUnicode_READY(self) == -1)
9964 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009967 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009969 if (PyUnicode_IS_ASCII(self))
9970 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009971 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009972 PyUnicode_GET_LENGTH(self), maxcount
9973 );
9974 else
9975 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009976 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009977 PyUnicode_GET_LENGTH(self), maxcount
9978 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 case PyUnicode_2BYTE_KIND:
9980 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009981 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 PyUnicode_GET_LENGTH(self), maxcount
9983 );
9984 case PyUnicode_4BYTE_KIND:
9985 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009986 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 PyUnicode_GET_LENGTH(self), maxcount
9988 );
9989 default:
9990 assert(0);
9991 return NULL;
9992 }
9993
9994 if (PyUnicode_READY(substring) == -1)
9995 return NULL;
9996
9997 kind1 = PyUnicode_KIND(self);
9998 kind2 = PyUnicode_KIND(substring);
9999 kind = kind1 > kind2 ? kind1 : kind2;
10000 buf1 = PyUnicode_DATA(self);
10001 buf2 = PyUnicode_DATA(substring);
10002 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010003 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 if (!buf1)
10005 return NULL;
10006 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010007 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 if (!buf2) {
10009 if (kind1 != kind) PyMem_Free(buf1);
10010 return NULL;
10011 }
10012 len1 = PyUnicode_GET_LENGTH(self);
10013 len2 = PyUnicode_GET_LENGTH(substring);
10014
Benjamin Petersonead6b532011-12-20 17:23:42 -060010015 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010017 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10018 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010019 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010020 else
10021 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010022 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 break;
10024 case PyUnicode_2BYTE_KIND:
10025 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010026 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 break;
10028 case PyUnicode_4BYTE_KIND:
10029 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010030 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 break;
10032 default:
10033 out = NULL;
10034 }
10035 if (kind1 != kind)
10036 PyMem_Free(buf1);
10037 if (kind2 != kind)
10038 PyMem_Free(buf2);
10039 return out;
10040}
10041
10042static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010043anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10044 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010046 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010048 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10049 return asciilib_find(buf1, len1, buf2, len2, offset);
10050 else
10051 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 case PyUnicode_2BYTE_KIND:
10053 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10054 case PyUnicode_4BYTE_KIND:
10055 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10056 }
10057 assert(0);
10058 return -1;
10059}
10060
10061static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010062anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10063 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010065 switch (kind) {
10066 case PyUnicode_1BYTE_KIND:
10067 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10068 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10069 else
10070 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10071 case PyUnicode_2BYTE_KIND:
10072 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10073 case PyUnicode_4BYTE_KIND:
10074 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10075 }
10076 assert(0);
10077 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010078}
10079
Alexander Belopolsky40018472011-02-26 01:02:56 +000010080static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081replace(PyObject *self, PyObject *str1,
10082 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 PyObject *u;
10085 char *sbuf = PyUnicode_DATA(self);
10086 char *buf1 = PyUnicode_DATA(str1);
10087 char *buf2 = PyUnicode_DATA(str2);
10088 int srelease = 0, release1 = 0, release2 = 0;
10089 int skind = PyUnicode_KIND(self);
10090 int kind1 = PyUnicode_KIND(str1);
10091 int kind2 = PyUnicode_KIND(str2);
10092 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10093 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10094 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010095 int mayshrink;
10096 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010097
10098 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010099 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010101 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010102
Victor Stinner59de0ee2011-10-07 10:01:28 +020010103 if (str1 == str2)
10104 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 if (skind < kind1)
10106 /* substring too wide to be present */
10107 goto nothing;
10108
Victor Stinner49a0a212011-10-12 23:46:10 +020010109 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10110 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10111 /* Replacing str1 with str2 may cause a maxchar reduction in the
10112 result string. */
10113 mayshrink = (maxchar_str2 < maxchar);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010114 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010117 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010119 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010121 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010122 Py_UCS4 u1, u2;
10123 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010124 Py_ssize_t index, pos;
10125 char *src;
10126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010128 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10129 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010130 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010133 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010135 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010137
10138 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10139 index = 0;
10140 src = sbuf;
10141 while (--maxcount)
10142 {
10143 pos++;
10144 src += pos * PyUnicode_KIND(self);
10145 slen -= pos;
10146 index += pos;
10147 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10148 if (pos < 0)
10149 break;
10150 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10151 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010152 }
10153 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 int rkind = skind;
10155 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010156 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (kind1 < rkind) {
10159 /* widen substring */
10160 buf1 = _PyUnicode_AsKind(str1, rkind);
10161 if (!buf1) goto error;
10162 release1 = 1;
10163 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010164 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165 if (i < 0)
10166 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 if (rkind > kind2) {
10168 /* widen replacement */
10169 buf2 = _PyUnicode_AsKind(str2, rkind);
10170 if (!buf2) goto error;
10171 release2 = 1;
10172 }
10173 else if (rkind < kind2) {
10174 /* widen self and buf1 */
10175 rkind = kind2;
10176 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010177 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 sbuf = _PyUnicode_AsKind(self, rkind);
10179 if (!sbuf) goto error;
10180 srelease = 1;
10181 buf1 = _PyUnicode_AsKind(str1, rkind);
10182 if (!buf1) goto error;
10183 release1 = 1;
10184 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010185 u = PyUnicode_New(slen, maxchar);
10186 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010188 assert(PyUnicode_KIND(u) == rkind);
10189 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010190
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010191 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010192 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010193 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010195 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010197
10198 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010199 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010200 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010201 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010202 if (i == -1)
10203 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010204 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010206 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010210 }
10211 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010213 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 int rkind = skind;
10215 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010218 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 buf1 = _PyUnicode_AsKind(str1, rkind);
10220 if (!buf1) goto error;
10221 release1 = 1;
10222 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010223 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010224 if (n == 0)
10225 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010227 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 buf2 = _PyUnicode_AsKind(str2, rkind);
10229 if (!buf2) goto error;
10230 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010233 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 rkind = kind2;
10235 sbuf = _PyUnicode_AsKind(self, rkind);
10236 if (!sbuf) goto error;
10237 srelease = 1;
10238 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010239 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 buf1 = _PyUnicode_AsKind(str1, rkind);
10241 if (!buf1) goto error;
10242 release1 = 1;
10243 }
10244 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10245 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010246 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 PyErr_SetString(PyExc_OverflowError,
10248 "replace string is too long");
10249 goto error;
10250 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010251 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010252 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010253 _Py_INCREF_UNICODE_EMPTY();
10254 if (!unicode_empty)
10255 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010256 u = unicode_empty;
10257 goto done;
10258 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010259 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 PyErr_SetString(PyExc_OverflowError,
10261 "replace string is too long");
10262 goto error;
10263 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010264 u = PyUnicode_New(new_size, maxchar);
10265 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010267 assert(PyUnicode_KIND(u) == rkind);
10268 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 ires = i = 0;
10270 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010271 while (n-- > 0) {
10272 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010273 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010274 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010275 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010276 if (j == -1)
10277 break;
10278 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010279 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010280 memcpy(res + rkind * ires,
10281 sbuf + rkind * i,
10282 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010284 }
10285 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010287 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010289 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010293 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010295 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010296 memcpy(res + rkind * ires,
10297 sbuf + rkind * i,
10298 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010299 }
10300 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010301 /* interleave */
10302 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010303 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010305 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010307 if (--n <= 0)
10308 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010309 memcpy(res + rkind * ires,
10310 sbuf + rkind * i,
10311 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 ires++;
10313 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010314 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010315 memcpy(res + rkind * ires,
10316 sbuf + rkind * i,
10317 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010318 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010319 }
10320
10321 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010322 unicode_adjust_maxchar(&u);
10323 if (u == NULL)
10324 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010326
10327 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 if (srelease)
10329 PyMem_FREE(sbuf);
10330 if (release1)
10331 PyMem_FREE(buf1);
10332 if (release2)
10333 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010334 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010336
Benjamin Peterson29060642009-01-31 22:14:21 +000010337 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010338 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 if (srelease)
10340 PyMem_FREE(sbuf);
10341 if (release1)
10342 PyMem_FREE(buf1);
10343 if (release2)
10344 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010345 return unicode_result_unchanged(self);
10346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 error:
10348 if (srelease && sbuf)
10349 PyMem_FREE(sbuf);
10350 if (release1 && buf1)
10351 PyMem_FREE(buf1);
10352 if (release2 && buf2)
10353 PyMem_FREE(buf2);
10354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355}
10356
10357/* --- Unicode Object Methods --------------------------------------------- */
10358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010359PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010360 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361\n\
10362Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010363characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364
10365static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010366unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010368 if (PyUnicode_READY(self) == -1)
10369 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010370 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371}
10372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010373PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010374 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375\n\
10376Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010377have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378
10379static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010380unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010382 if (PyUnicode_READY(self) == -1)
10383 return NULL;
10384 if (PyUnicode_GET_LENGTH(self) == 0)
10385 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010386 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387}
10388
Benjamin Petersond5890c82012-01-14 13:23:30 -050010389PyDoc_STRVAR(casefold__doc__,
10390 "S.casefold() -> str\n\
10391\n\
10392Return a version of S suitable for caseless comparisons.");
10393
10394static PyObject *
10395unicode_casefold(PyObject *self)
10396{
10397 if (PyUnicode_READY(self) == -1)
10398 return NULL;
10399 if (PyUnicode_IS_ASCII(self))
10400 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010401 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010402}
10403
10404
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010405/* Argument converter. Coerces to a single unicode character */
10406
10407static int
10408convert_uc(PyObject *obj, void *addr)
10409{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010411 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010412
Benjamin Peterson14339b62009-01-31 16:36:08 +000010413 uniobj = PyUnicode_FromObject(obj);
10414 if (uniobj == NULL) {
10415 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010416 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010417 return 0;
10418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010420 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010421 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010422 Py_DECREF(uniobj);
10423 return 0;
10424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010426 Py_DECREF(uniobj);
10427 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010428}
10429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010430PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010431 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010433Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010434done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435
10436static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010437unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010438{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010439 Py_ssize_t marg, left;
10440 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 Py_UCS4 fillchar = ' ';
10442
Victor Stinnere9a29352011-10-01 02:14:59 +020010443 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445
Benjamin Petersonbac79492012-01-14 13:34:47 -050010446 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010447 return NULL;
10448
Victor Stinnerc4b49542011-12-11 22:44:26 +010010449 if (PyUnicode_GET_LENGTH(self) >= width)
10450 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010451
Victor Stinnerc4b49542011-12-11 22:44:26 +010010452 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010453 left = marg / 2 + (marg & width & 1);
10454
Victor Stinner9310abb2011-10-05 00:59:23 +020010455 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456}
10457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458/* This function assumes that str1 and str2 are readied by the caller. */
10459
Marc-André Lemburge5034372000-08-08 08:04:29 +000010460static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010461unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010462{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 int kind1, kind2;
10464 void *data1, *data2;
10465 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 kind1 = PyUnicode_KIND(str1);
10468 kind2 = PyUnicode_KIND(str2);
10469 data1 = PyUnicode_DATA(str1);
10470 data2 = PyUnicode_DATA(str2);
10471 len1 = PyUnicode_GET_LENGTH(str1);
10472 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 for (i = 0; i < len1 && i < len2; ++i) {
10475 Py_UCS4 c1, c2;
10476 c1 = PyUnicode_READ(kind1, data1, i);
10477 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010478
10479 if (c1 != c2)
10480 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010481 }
10482
10483 return (len1 < len2) ? -1 : (len1 != len2);
10484}
10485
Alexander Belopolsky40018472011-02-26 01:02:56 +000010486int
10487PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10490 if (PyUnicode_READY(left) == -1 ||
10491 PyUnicode_READY(right) == -1)
10492 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010493 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010495 PyErr_Format(PyExc_TypeError,
10496 "Can't compare %.100s and %.100s",
10497 left->ob_type->tp_name,
10498 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499 return -1;
10500}
10501
Martin v. Löwis5b222132007-06-10 09:51:05 +000010502int
10503PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10504{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 Py_ssize_t i;
10506 int kind;
10507 void *data;
10508 Py_UCS4 chr;
10509
Victor Stinner910337b2011-10-03 03:20:16 +020010510 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 if (PyUnicode_READY(uni) == -1)
10512 return -1;
10513 kind = PyUnicode_KIND(uni);
10514 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010515 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10517 if (chr != str[i])
10518 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010519 /* This check keeps Python strings that end in '\0' from comparing equal
10520 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010522 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010523 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010524 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010525 return 0;
10526}
10527
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010528
Benjamin Peterson29060642009-01-31 22:14:21 +000010529#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010530 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010531
Alexander Belopolsky40018472011-02-26 01:02:56 +000010532PyObject *
10533PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010534{
10535 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010536
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010537 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10538 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 if (PyUnicode_READY(left) == -1 ||
10540 PyUnicode_READY(right) == -1)
10541 return NULL;
10542 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10543 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010544 if (op == Py_EQ) {
10545 Py_INCREF(Py_False);
10546 return Py_False;
10547 }
10548 if (op == Py_NE) {
10549 Py_INCREF(Py_True);
10550 return Py_True;
10551 }
10552 }
10553 if (left == right)
10554 result = 0;
10555 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010556 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010557
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010558 /* Convert the return value to a Boolean */
10559 switch (op) {
10560 case Py_EQ:
10561 v = TEST_COND(result == 0);
10562 break;
10563 case Py_NE:
10564 v = TEST_COND(result != 0);
10565 break;
10566 case Py_LE:
10567 v = TEST_COND(result <= 0);
10568 break;
10569 case Py_GE:
10570 v = TEST_COND(result >= 0);
10571 break;
10572 case Py_LT:
10573 v = TEST_COND(result == -1);
10574 break;
10575 case Py_GT:
10576 v = TEST_COND(result == 1);
10577 break;
10578 default:
10579 PyErr_BadArgument();
10580 return NULL;
10581 }
10582 Py_INCREF(v);
10583 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010584 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010585
Brian Curtindfc80e32011-08-10 20:28:54 -050010586 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010587}
10588
Alexander Belopolsky40018472011-02-26 01:02:56 +000010589int
10590PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010591{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010592 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 int kind1, kind2, kind;
10594 void *buf1, *buf2;
10595 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010596 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010597
10598 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010599 sub = PyUnicode_FromObject(element);
10600 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010601 PyErr_Format(PyExc_TypeError,
10602 "'in <string>' requires string as left operand, not %s",
10603 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010604 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010605 }
10606
Thomas Wouters477c8d52006-05-27 19:21:47 +000010607 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010608 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010609 Py_DECREF(sub);
10610 return -1;
10611 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010612 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10613 Py_DECREF(sub);
10614 Py_DECREF(str);
10615 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 kind1 = PyUnicode_KIND(str);
10618 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010619 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 buf1 = PyUnicode_DATA(str);
10621 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010622 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010623 if (kind2 > kind) {
10624 Py_DECREF(sub);
10625 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010626 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010627 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010628 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 if (!buf2) {
10631 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010632 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 return -1;
10634 }
10635 len1 = PyUnicode_GET_LENGTH(str);
10636 len2 = PyUnicode_GET_LENGTH(sub);
10637
Benjamin Petersonead6b532011-12-20 17:23:42 -060010638 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 case PyUnicode_1BYTE_KIND:
10640 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10641 break;
10642 case PyUnicode_2BYTE_KIND:
10643 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10644 break;
10645 case PyUnicode_4BYTE_KIND:
10646 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10647 break;
10648 default:
10649 result = -1;
10650 assert(0);
10651 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652
10653 Py_DECREF(str);
10654 Py_DECREF(sub);
10655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (kind2 != kind)
10657 PyMem_Free(buf2);
10658
Guido van Rossum403d68b2000-03-13 15:55:09 +000010659 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010660}
10661
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662/* Concat to string or Unicode object giving a new Unicode object. */
10663
Alexander Belopolsky40018472011-02-26 01:02:56 +000010664PyObject *
10665PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010668 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010669 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670
10671 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010674 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010677 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678
10679 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010680 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010681 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010684 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010685 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010687 }
10688
Victor Stinner488fa492011-12-12 00:01:39 +010010689 u_len = PyUnicode_GET_LENGTH(u);
10690 v_len = PyUnicode_GET_LENGTH(v);
10691 if (u_len > PY_SSIZE_T_MAX - v_len) {
10692 PyErr_SetString(PyExc_OverflowError,
10693 "strings are too large to concat");
10694 goto onError;
10695 }
10696 new_len = u_len + v_len;
10697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010699 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010700 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010703 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010705 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010706 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10707 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708 Py_DECREF(u);
10709 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010710 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010711 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712
Benjamin Peterson29060642009-01-31 22:14:21 +000010713 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714 Py_XDECREF(u);
10715 Py_XDECREF(v);
10716 return NULL;
10717}
10718
Walter Dörwald1ab83302007-05-18 17:15:44 +000010719void
Victor Stinner23e56682011-10-03 03:54:37 +020010720PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010721{
Victor Stinner23e56682011-10-03 03:54:37 +020010722 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010723 Py_UCS4 maxchar, maxchar2;
10724 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010725
10726 if (p_left == NULL) {
10727 if (!PyErr_Occurred())
10728 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010729 return;
10730 }
Victor Stinner23e56682011-10-03 03:54:37 +020010731 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010732 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010733 if (!PyErr_Occurred())
10734 PyErr_BadInternalCall();
10735 goto error;
10736 }
10737
Benjamin Petersonbac79492012-01-14 13:34:47 -050010738 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010739 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010740 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010741 goto error;
10742
Victor Stinner488fa492011-12-12 00:01:39 +010010743 /* Shortcuts */
10744 if (left == unicode_empty) {
10745 Py_DECREF(left);
10746 Py_INCREF(right);
10747 *p_left = right;
10748 return;
10749 }
10750 if (right == unicode_empty)
10751 return;
10752
10753 left_len = PyUnicode_GET_LENGTH(left);
10754 right_len = PyUnicode_GET_LENGTH(right);
10755 if (left_len > PY_SSIZE_T_MAX - right_len) {
10756 PyErr_SetString(PyExc_OverflowError,
10757 "strings are too large to concat");
10758 goto error;
10759 }
10760 new_len = left_len + right_len;
10761
10762 if (unicode_modifiable(left)
10763 && PyUnicode_CheckExact(right)
10764 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010765 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10766 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010767 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010768 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010769 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10770 {
10771 /* append inplace */
10772 if (unicode_resize(p_left, new_len) != 0) {
10773 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10774 * deallocated so it cannot be put back into
10775 * 'variable'. The MemoryError is raised when there
10776 * is no value in 'variable', which might (very
10777 * remotely) be a cause of incompatibilities.
10778 */
10779 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010780 }
Victor Stinner488fa492011-12-12 00:01:39 +010010781 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010782 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010783 }
Victor Stinner488fa492011-12-12 00:01:39 +010010784 else {
10785 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10786 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010787 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010788
Victor Stinner488fa492011-12-12 00:01:39 +010010789 /* Concat the two Unicode strings */
10790 res = PyUnicode_New(new_len, maxchar);
10791 if (res == NULL)
10792 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010793 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10794 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010795 Py_DECREF(left);
10796 *p_left = res;
10797 }
10798 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010799 return;
10800
10801error:
Victor Stinner488fa492011-12-12 00:01:39 +010010802 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010803}
10804
10805void
10806PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10807{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010808 PyUnicode_Append(pleft, right);
10809 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010810}
10811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010812PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010813 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010815Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010816string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010817interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818
10819static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010820unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010822 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010823 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010824 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 int kind1, kind2, kind;
10827 void *buf1, *buf2;
10828 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829
Jesus Ceaac451502011-04-20 17:09:23 +020010830 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10831 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010832 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 kind1 = PyUnicode_KIND(self);
10835 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010836 if (kind2 > kind1)
10837 return PyLong_FromLong(0);
10838 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 buf1 = PyUnicode_DATA(self);
10840 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010842 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 if (!buf2) {
10844 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845 return NULL;
10846 }
10847 len1 = PyUnicode_GET_LENGTH(self);
10848 len2 = PyUnicode_GET_LENGTH(substring);
10849
10850 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010851 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010852 case PyUnicode_1BYTE_KIND:
10853 iresult = ucs1lib_count(
10854 ((Py_UCS1*)buf1) + start, end - start,
10855 buf2, len2, PY_SSIZE_T_MAX
10856 );
10857 break;
10858 case PyUnicode_2BYTE_KIND:
10859 iresult = ucs2lib_count(
10860 ((Py_UCS2*)buf1) + start, end - start,
10861 buf2, len2, PY_SSIZE_T_MAX
10862 );
10863 break;
10864 case PyUnicode_4BYTE_KIND:
10865 iresult = ucs4lib_count(
10866 ((Py_UCS4*)buf1) + start, end - start,
10867 buf2, len2, PY_SSIZE_T_MAX
10868 );
10869 break;
10870 default:
10871 assert(0); iresult = 0;
10872 }
10873
10874 result = PyLong_FromSsize_t(iresult);
10875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010876 if (kind2 != kind)
10877 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878
10879 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010880
Guido van Rossumd57fd912000-03-10 22:53:23 +000010881 return result;
10882}
10883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010884PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010885 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010887Encode S using the codec registered for encoding. Default encoding\n\
10888is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010889handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010890a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10891'xmlcharrefreplace' as well as any other name registered with\n\
10892codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893
10894static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010895unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010897 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898 char *encoding = NULL;
10899 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010900
Benjamin Peterson308d6372009-09-18 21:42:35 +000010901 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10902 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010904 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010905}
10906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010907PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010908 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909\n\
10910Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010911If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912
10913static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010914unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010916 Py_ssize_t i, j, line_pos, src_len, incr;
10917 Py_UCS4 ch;
10918 PyObject *u;
10919 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010921 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010922 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923
10924 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010925 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926
Antoine Pitrou22425222011-10-04 19:10:51 +020010927 if (PyUnicode_READY(self) == -1)
10928 return NULL;
10929
Thomas Wouters7e474022000-07-16 12:04:32 +000010930 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010931 src_len = PyUnicode_GET_LENGTH(self);
10932 i = j = line_pos = 0;
10933 kind = PyUnicode_KIND(self);
10934 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010935 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010936 for (; i < src_len; i++) {
10937 ch = PyUnicode_READ(kind, src_data, i);
10938 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010939 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010940 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010941 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010942 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010943 goto overflow;
10944 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010945 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010946 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010947 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010949 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010950 goto overflow;
10951 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010953 if (ch == '\n' || ch == '\r')
10954 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010956 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010957 if (!found)
10958 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010959
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010961 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962 if (!u)
10963 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010964 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965
Antoine Pitroue71d5742011-10-04 15:55:09 +020010966 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967
Antoine Pitroue71d5742011-10-04 15:55:09 +020010968 for (; i < src_len; i++) {
10969 ch = PyUnicode_READ(kind, src_data, i);
10970 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010971 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010972 incr = tabsize - (line_pos % tabsize);
10973 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010974 FILL(kind, dest_data, ' ', j, incr);
10975 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010977 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010978 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010979 line_pos++;
10980 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010981 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010982 if (ch == '\n' || ch == '\r')
10983 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010985 }
10986 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010987 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010988
Antoine Pitroue71d5742011-10-04 15:55:09 +020010989 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010990 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10991 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992}
10993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010994PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996\n\
10997Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010998such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999arguments start and end are interpreted as in slice notation.\n\
11000\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011001Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002
11003static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011006 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011007 Py_ssize_t start;
11008 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011009 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010
Jesus Ceaac451502011-04-20 17:09:23 +020011011 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11012 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 if (PyUnicode_READY(self) == -1)
11016 return NULL;
11017 if (PyUnicode_READY(substring) == -1)
11018 return NULL;
11019
Victor Stinner7931d9a2011-11-04 00:22:48 +010011020 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021
11022 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 if (result == -2)
11025 return NULL;
11026
Christian Heimes217cfd12007-12-02 14:31:20 +000011027 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028}
11029
11030static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011031unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011033 void *data;
11034 enum PyUnicode_Kind kind;
11035 Py_UCS4 ch;
11036 PyObject *res;
11037
11038 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11039 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011041 }
11042 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11043 PyErr_SetString(PyExc_IndexError, "string index out of range");
11044 return NULL;
11045 }
11046 kind = PyUnicode_KIND(self);
11047 data = PyUnicode_DATA(self);
11048 ch = PyUnicode_READ(kind, data, index);
11049 if (ch < 256)
11050 return get_latin1_char(ch);
11051
11052 res = PyUnicode_New(1, ch);
11053 if (res == NULL)
11054 return NULL;
11055 kind = PyUnicode_KIND(res);
11056 data = PyUnicode_DATA(res);
11057 PyUnicode_WRITE(kind, data, 0, ch);
11058 assert(_PyUnicode_CheckConsistency(res, 1));
11059 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060}
11061
Guido van Rossumc2504932007-09-18 19:42:40 +000011062/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011063 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011064static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011065unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066{
Guido van Rossumc2504932007-09-18 19:42:40 +000011067 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011068 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011069
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011070#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011071 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011072#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 if (_PyUnicode_HASH(self) != -1)
11074 return _PyUnicode_HASH(self);
11075 if (PyUnicode_READY(self) == -1)
11076 return -1;
11077 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011078 /*
11079 We make the hash of the empty string be 0, rather than using
11080 (prefix ^ suffix), since this slightly obfuscates the hash secret
11081 */
11082 if (len == 0) {
11083 _PyUnicode_HASH(self) = 0;
11084 return 0;
11085 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086
11087 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011088#define HASH(P) \
11089 x ^= (Py_uhash_t) *P << 7; \
11090 while (--len >= 0) \
11091 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092
Georg Brandl2fb477c2012-02-21 00:33:36 +010011093 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 switch (PyUnicode_KIND(self)) {
11095 case PyUnicode_1BYTE_KIND: {
11096 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11097 HASH(c);
11098 break;
11099 }
11100 case PyUnicode_2BYTE_KIND: {
11101 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11102 HASH(s);
11103 break;
11104 }
11105 default: {
11106 Py_UCS4 *l;
11107 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11108 "Impossible switch case in unicode_hash");
11109 l = PyUnicode_4BYTE_DATA(self);
11110 HASH(l);
11111 break;
11112 }
11113 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011114 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11115 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116
Guido van Rossumc2504932007-09-18 19:42:40 +000011117 if (x == -1)
11118 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011120 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011122#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011124PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011125 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011127Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128
11129static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011132 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011133 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011134 Py_ssize_t start;
11135 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136
Jesus Ceaac451502011-04-20 17:09:23 +020011137 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11138 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 if (PyUnicode_READY(self) == -1)
11142 return NULL;
11143 if (PyUnicode_READY(substring) == -1)
11144 return NULL;
11145
Victor Stinner7931d9a2011-11-04 00:22:48 +010011146 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147
11148 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 if (result == -2)
11151 return NULL;
11152
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153 if (result < 0) {
11154 PyErr_SetString(PyExc_ValueError, "substring not found");
11155 return NULL;
11156 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011157
Christian Heimes217cfd12007-12-02 14:31:20 +000011158 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159}
11160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011161PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011162 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011164Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011165at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166
11167static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011168unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 Py_ssize_t i, length;
11171 int kind;
11172 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173 int cased;
11174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 if (PyUnicode_READY(self) == -1)
11176 return NULL;
11177 length = PyUnicode_GET_LENGTH(self);
11178 kind = PyUnicode_KIND(self);
11179 data = PyUnicode_DATA(self);
11180
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182 if (length == 1)
11183 return PyBool_FromLong(
11184 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011186 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011188 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011189
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191 for (i = 0; i < length; i++) {
11192 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011193
Benjamin Peterson29060642009-01-31 22:14:21 +000011194 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11195 return PyBool_FromLong(0);
11196 else if (!cased && Py_UNICODE_ISLOWER(ch))
11197 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011199 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200}
11201
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011202PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011203 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011205Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011206at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207
11208static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011209unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 Py_ssize_t i, length;
11212 int kind;
11213 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214 int cased;
11215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 if (PyUnicode_READY(self) == -1)
11217 return NULL;
11218 length = PyUnicode_GET_LENGTH(self);
11219 kind = PyUnicode_KIND(self);
11220 data = PyUnicode_DATA(self);
11221
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011223 if (length == 1)
11224 return PyBool_FromLong(
11225 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011227 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011229 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011230
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 for (i = 0; i < length; i++) {
11233 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011234
Benjamin Peterson29060642009-01-31 22:14:21 +000011235 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11236 return PyBool_FromLong(0);
11237 else if (!cased && Py_UNICODE_ISUPPER(ch))
11238 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011240 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241}
11242
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011243PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011244 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011246Return True if S is a titlecased string and there is at least one\n\
11247character in S, i.e. upper- and titlecase characters may only\n\
11248follow uncased characters and lowercase characters only cased ones.\n\
11249Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250
11251static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011252unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254 Py_ssize_t i, length;
11255 int kind;
11256 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257 int cased, previous_is_cased;
11258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011259 if (PyUnicode_READY(self) == -1)
11260 return NULL;
11261 length = PyUnicode_GET_LENGTH(self);
11262 kind = PyUnicode_KIND(self);
11263 data = PyUnicode_DATA(self);
11264
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266 if (length == 1) {
11267 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11268 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11269 (Py_UNICODE_ISUPPER(ch) != 0));
11270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011272 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011274 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011275
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 cased = 0;
11277 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 for (i = 0; i < length; i++) {
11279 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011280
Benjamin Peterson29060642009-01-31 22:14:21 +000011281 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11282 if (previous_is_cased)
11283 return PyBool_FromLong(0);
11284 previous_is_cased = 1;
11285 cased = 1;
11286 }
11287 else if (Py_UNICODE_ISLOWER(ch)) {
11288 if (!previous_is_cased)
11289 return PyBool_FromLong(0);
11290 previous_is_cased = 1;
11291 cased = 1;
11292 }
11293 else
11294 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011296 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297}
11298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011299PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011302Return True if all characters in S are whitespace\n\
11303and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304
11305static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011306unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 Py_ssize_t i, length;
11309 int kind;
11310 void *data;
11311
11312 if (PyUnicode_READY(self) == -1)
11313 return NULL;
11314 length = PyUnicode_GET_LENGTH(self);
11315 kind = PyUnicode_KIND(self);
11316 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 if (length == 1)
11320 return PyBool_FromLong(
11321 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011323 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011325 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 for (i = 0; i < length; i++) {
11328 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011329 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011330 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011332 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333}
11334
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011335PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011336 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011337\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011338Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011339and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011340
11341static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011342unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011343{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 Py_ssize_t i, length;
11345 int kind;
11346 void *data;
11347
11348 if (PyUnicode_READY(self) == -1)
11349 return NULL;
11350 length = PyUnicode_GET_LENGTH(self);
11351 kind = PyUnicode_KIND(self);
11352 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011353
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011354 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 if (length == 1)
11356 return PyBool_FromLong(
11357 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011358
11359 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011361 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 for (i = 0; i < length; i++) {
11364 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011365 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011366 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011367 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011368}
11369
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011370PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011371 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011372\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011373Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011374and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011375
11376static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011377unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011378{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 int kind;
11380 void *data;
11381 Py_ssize_t len, i;
11382
11383 if (PyUnicode_READY(self) == -1)
11384 return NULL;
11385
11386 kind = PyUnicode_KIND(self);
11387 data = PyUnicode_DATA(self);
11388 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011389
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011390 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011391 if (len == 1) {
11392 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11393 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11394 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011395
11396 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011398 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 for (i = 0; i < len; i++) {
11401 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011402 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011403 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011404 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011405 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011406}
11407
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011408PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011409 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011411Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011412False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413
11414static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011415unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 Py_ssize_t i, length;
11418 int kind;
11419 void *data;
11420
11421 if (PyUnicode_READY(self) == -1)
11422 return NULL;
11423 length = PyUnicode_GET_LENGTH(self);
11424 kind = PyUnicode_KIND(self);
11425 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 if (length == 1)
11429 return PyBool_FromLong(
11430 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011432 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 for (i = 0; i < length; i++) {
11437 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011440 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441}
11442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011443PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011446Return True if all characters in S are digits\n\
11447and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
11449static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011450unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 Py_ssize_t i, length;
11453 int kind;
11454 void *data;
11455
11456 if (PyUnicode_READY(self) == -1)
11457 return NULL;
11458 length = PyUnicode_GET_LENGTH(self);
11459 kind = PyUnicode_KIND(self);
11460 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (length == 1) {
11464 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11465 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011468 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 for (i = 0; i < length; i++) {
11473 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011476 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477}
11478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011479PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011482Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011483False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484
11485static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011486unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 Py_ssize_t i, length;
11489 int kind;
11490 void *data;
11491
11492 if (PyUnicode_READY(self) == -1)
11493 return NULL;
11494 length = PyUnicode_GET_LENGTH(self);
11495 kind = PyUnicode_KIND(self);
11496 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 if (length == 1)
11500 return PyBool_FromLong(
11501 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011503 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 for (i = 0; i < length; i++) {
11508 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011511 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512}
11513
Martin v. Löwis47383402007-08-15 07:32:56 +000011514int
11515PyUnicode_IsIdentifier(PyObject *self)
11516{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 int kind;
11518 void *data;
11519 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011520 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522 if (PyUnicode_READY(self) == -1) {
11523 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011524 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 }
11526
11527 /* Special case for empty strings */
11528 if (PyUnicode_GET_LENGTH(self) == 0)
11529 return 0;
11530 kind = PyUnicode_KIND(self);
11531 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011532
11533 /* PEP 3131 says that the first character must be in
11534 XID_Start and subsequent characters in XID_Continue,
11535 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011536 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011537 letters, digits, underscore). However, given the current
11538 definition of XID_Start and XID_Continue, it is sufficient
11539 to check just for these, except that _ must be allowed
11540 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011542 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011543 return 0;
11544
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011545 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011547 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011548 return 1;
11549}
11550
11551PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011553\n\
11554Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011555to the language definition.\n\
11556\n\
11557Use keyword.iskeyword() to test for reserved identifiers\n\
11558such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011559
11560static PyObject*
11561unicode_isidentifier(PyObject *self)
11562{
11563 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11564}
11565
Georg Brandl559e5d72008-06-11 18:37:52 +000011566PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011567 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011568\n\
11569Return True if all characters in S are considered\n\
11570printable in repr() or S is empty, False otherwise.");
11571
11572static PyObject*
11573unicode_isprintable(PyObject *self)
11574{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 Py_ssize_t i, length;
11576 int kind;
11577 void *data;
11578
11579 if (PyUnicode_READY(self) == -1)
11580 return NULL;
11581 length = PyUnicode_GET_LENGTH(self);
11582 kind = PyUnicode_KIND(self);
11583 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011584
11585 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 if (length == 1)
11587 return PyBool_FromLong(
11588 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 for (i = 0; i < length; i++) {
11591 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011592 Py_RETURN_FALSE;
11593 }
11594 }
11595 Py_RETURN_TRUE;
11596}
11597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011598PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011599 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600\n\
11601Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011602iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603
11604static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011605unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011607 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608}
11609
Martin v. Löwis18e16552006-02-15 17:27:45 +000011610static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011611unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 if (PyUnicode_READY(self) == -1)
11614 return -1;
11615 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616}
11617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011618PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011621Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011622done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623
11624static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011625unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011627 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 Py_UCS4 fillchar = ' ';
11629
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011630 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 return NULL;
11632
Benjamin Petersonbac79492012-01-14 13:34:47 -050011633 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635
Victor Stinnerc4b49542011-12-11 22:44:26 +010011636 if (PyUnicode_GET_LENGTH(self) >= width)
11637 return unicode_result_unchanged(self);
11638
11639 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640}
11641
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011642PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011643 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011645Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646
11647static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011648unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011650 if (PyUnicode_READY(self) == -1)
11651 return NULL;
11652 if (PyUnicode_IS_ASCII(self))
11653 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011654 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655}
11656
/* Strip modes; the values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string with its leading "|O:"
   (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11665
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011666/* externally visible for str.strip(unicode) */
11667PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011668_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011669{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 void *data;
11671 int kind;
11672 Py_ssize_t i, j, len;
11673 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11676 return NULL;
11677
11678 kind = PyUnicode_KIND(self);
11679 data = PyUnicode_DATA(self);
11680 len = PyUnicode_GET_LENGTH(self);
11681 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11682 PyUnicode_DATA(sepobj),
11683 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011684
Benjamin Peterson14339b62009-01-31 16:36:08 +000011685 i = 0;
11686 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 while (i < len &&
11688 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011689 i++;
11690 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011691 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011692
Benjamin Peterson14339b62009-01-31 16:36:08 +000011693 j = len;
11694 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011695 do {
11696 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 } while (j >= i &&
11698 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011699 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011700 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011701
Victor Stinner7931d9a2011-11-04 00:22:48 +010011702 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703}
11704
11705PyObject*
11706PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11707{
11708 unsigned char *data;
11709 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011710 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711
Victor Stinnerde636f32011-10-01 03:55:54 +020011712 if (PyUnicode_READY(self) == -1)
11713 return NULL;
11714
Victor Stinner684d5fd2012-05-03 02:32:34 +020011715 length = PyUnicode_GET_LENGTH(self);
11716 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011717
Victor Stinner684d5fd2012-05-03 02:32:34 +020011718 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011719 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720
Victor Stinnerde636f32011-10-01 03:55:54 +020011721 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011722 PyErr_SetString(PyExc_IndexError, "string index out of range");
11723 return NULL;
11724 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011725 if (start >= length || end < start)
11726 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011727
Victor Stinner684d5fd2012-05-03 02:32:34 +020011728 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011729 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011730 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011731 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011732 }
11733 else {
11734 kind = PyUnicode_KIND(self);
11735 data = PyUnicode_1BYTE_DATA(self);
11736 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011737 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011738 length);
11739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741
11742static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011743do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 int kind;
11746 void *data;
11747 Py_ssize_t len, i, j;
11748
11749 if (PyUnicode_READY(self) == -1)
11750 return NULL;
11751
11752 kind = PyUnicode_KIND(self);
11753 data = PyUnicode_DATA(self);
11754 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011755
Benjamin Peterson14339b62009-01-31 16:36:08 +000011756 i = 0;
11757 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011759 i++;
11760 }
11761 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011762
Benjamin Peterson14339b62009-01-31 16:36:08 +000011763 j = len;
11764 if (striptype != LEFTSTRIP) {
11765 do {
11766 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011768 j++;
11769 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011770
Victor Stinner7931d9a2011-11-04 00:22:48 +010011771 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772}
11773
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011774
11775static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011776do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011777{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011778 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011779
Benjamin Peterson14339b62009-01-31 16:36:08 +000011780 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11781 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011782
Benjamin Peterson14339b62009-01-31 16:36:08 +000011783 if (sep != NULL && sep != Py_None) {
11784 if (PyUnicode_Check(sep))
11785 return _PyUnicode_XStrip(self, striptype, sep);
11786 else {
11787 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 "%s arg must be None or str",
11789 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011790 return NULL;
11791 }
11792 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011793
Benjamin Peterson14339b62009-01-31 16:36:08 +000011794 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011795}
11796
11797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011798PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011799 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011800\n\
11801Return a copy of the string S with leading and trailing\n\
11802whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011803If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011804
11805static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011806unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011807{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011808 if (PyTuple_GET_SIZE(args) == 0)
11809 return do_strip(self, BOTHSTRIP); /* Common case */
11810 else
11811 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011812}
11813
11814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011815PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011817\n\
11818Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011819If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011820
11821static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011822unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011823{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011824 if (PyTuple_GET_SIZE(args) == 0)
11825 return do_strip(self, LEFTSTRIP); /* Common case */
11826 else
11827 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011828}
11829
11830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011831PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011832 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011833\n\
11834Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011835If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011836
11837static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011838unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011839{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011840 if (PyTuple_GET_SIZE(args) == 0)
11841 return do_strip(self, RIGHTSTRIP); /* Common case */
11842 else
11843 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011844}
11845
11846
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011848unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011850 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852
Serhiy Storchaka05997252013-01-26 12:14:02 +020011853 if (len < 1)
11854 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855
Victor Stinnerc4b49542011-12-11 22:44:26 +010011856 /* no repeat, return original string */
11857 if (len == 1)
11858 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011859
Benjamin Petersonbac79492012-01-14 13:34:47 -050011860 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 return NULL;
11862
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011863 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011864 PyErr_SetString(PyExc_OverflowError,
11865 "repeated string is too long");
11866 return NULL;
11867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011869
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011870 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871 if (!u)
11872 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011873 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 if (PyUnicode_GET_LENGTH(str) == 1) {
11876 const int kind = PyUnicode_KIND(str);
11877 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011878 if (kind == PyUnicode_1BYTE_KIND) {
11879 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011880 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011881 }
11882 else if (kind == PyUnicode_2BYTE_KIND) {
11883 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011884 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011885 ucs2[n] = fill_char;
11886 } else {
11887 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11888 assert(kind == PyUnicode_4BYTE_KIND);
11889 for (n = 0; n < len; ++n)
11890 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 }
11893 else {
11894 /* number of characters copied this far */
11895 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011896 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 char *to = (char *) PyUnicode_DATA(u);
11898 Py_MEMCPY(to, PyUnicode_DATA(str),
11899 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 n = (done <= nchars-done) ? done : nchars-done;
11902 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011903 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905 }
11906
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011907 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011908 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909}
11910
Alexander Belopolsky40018472011-02-26 01:02:56 +000011911PyObject *
11912PyUnicode_Replace(PyObject *obj,
11913 PyObject *subobj,
11914 PyObject *replobj,
11915 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916{
11917 PyObject *self;
11918 PyObject *str1;
11919 PyObject *str2;
11920 PyObject *result;
11921
11922 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011923 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011926 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011927 Py_DECREF(self);
11928 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929 }
11930 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011931 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011932 Py_DECREF(self);
11933 Py_DECREF(str1);
11934 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011936 if (PyUnicode_READY(self) == -1 ||
11937 PyUnicode_READY(str1) == -1 ||
11938 PyUnicode_READY(str2) == -1)
11939 result = NULL;
11940 else
11941 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942 Py_DECREF(self);
11943 Py_DECREF(str1);
11944 Py_DECREF(str2);
11945 return result;
11946}
11947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011948PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011949 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950\n\
11951Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011952old replaced by new. If the optional argument count is\n\
11953given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954
11955static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 PyObject *str1;
11959 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011960 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961 PyObject *result;
11962
Martin v. Löwis18e16552006-02-15 17:27:45 +000011963 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011965 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011966 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011968 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 return NULL;
11970 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011971 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 Py_DECREF(str1);
11973 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011974 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011975 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11976 result = NULL;
11977 else
11978 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979
11980 Py_DECREF(str1);
11981 Py_DECREF(str2);
11982 return result;
11983}
11984
Alexander Belopolsky40018472011-02-26 01:02:56 +000011985static PyObject *
11986unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011988 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 Py_ssize_t isize;
11990 Py_ssize_t osize, squote, dquote, i, o;
11991 Py_UCS4 max, quote;
11992 int ikind, okind;
11993 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011996 return NULL;
11997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 isize = PyUnicode_GET_LENGTH(unicode);
11999 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 /* Compute length of output, quote characters, and
12002 maximum character */
12003 osize = 2; /* quotes */
12004 max = 127;
12005 squote = dquote = 0;
12006 ikind = PyUnicode_KIND(unicode);
12007 for (i = 0; i < isize; i++) {
12008 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012009 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012011 case '\'': squote++; break;
12012 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012014 incr = 2;
12015 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 default:
12017 /* Fast-path ASCII */
12018 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012019 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012021 ;
12022 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012025 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012027 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012029 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012031 if (osize > PY_SSIZE_T_MAX - incr) {
12032 PyErr_SetString(PyExc_OverflowError,
12033 "string is too long to generate repr");
12034 return NULL;
12035 }
12036 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 }
12038
12039 quote = '\'';
12040 if (squote) {
12041 if (dquote)
12042 /* Both squote and dquote present. Use squote,
12043 and escape them */
12044 osize += squote;
12045 else
12046 quote = '"';
12047 }
12048
12049 repr = PyUnicode_New(osize, max);
12050 if (repr == NULL)
12051 return NULL;
12052 okind = PyUnicode_KIND(repr);
12053 odata = PyUnicode_DATA(repr);
12054
12055 PyUnicode_WRITE(okind, odata, 0, quote);
12056 PyUnicode_WRITE(okind, odata, osize-1, quote);
12057
12058 for (i = 0, o = 1; i < isize; i++) {
12059 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012060
12061 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062 if ((ch == quote) || (ch == '\\')) {
12063 PyUnicode_WRITE(okind, odata, o++, '\\');
12064 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012065 continue;
12066 }
12067
Benjamin Peterson29060642009-01-31 22:14:21 +000012068 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012069 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 PyUnicode_WRITE(okind, odata, o++, '\\');
12071 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012072 }
12073 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 PyUnicode_WRITE(okind, odata, o++, '\\');
12075 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012076 }
12077 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 PyUnicode_WRITE(okind, odata, o++, '\\');
12079 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012080 }
12081
12082 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012083 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012084 PyUnicode_WRITE(okind, odata, o++, '\\');
12085 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012086 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12087 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012088 }
12089
Georg Brandl559e5d72008-06-11 18:37:52 +000012090 /* Copy ASCII characters as-is */
12091 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012093 }
12094
Benjamin Peterson29060642009-01-31 22:14:21 +000012095 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012096 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012097 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012098 (categories Z* and C* except ASCII space)
12099 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012101 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012102 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012105 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12106 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012107 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012108 /* Map 16-bit characters to '\uxxxx' */
12109 else if (ch <= 0xffff) {
12110 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012111 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12112 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12113 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12114 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012115 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012116 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012117 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012118 PyUnicode_WRITE(okind, odata, o++, 'U');
12119 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12120 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12121 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012123 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12124 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12125 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12126 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012127 }
12128 }
12129 /* Copy characters as-is */
12130 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012132 }
12133 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012134 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012136 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012137 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138}
12139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012140PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012141 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012142\n\
12143Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012144such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145arguments start and end are interpreted as in slice notation.\n\
12146\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012147Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148
12149static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012152 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012153 Py_ssize_t start;
12154 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012155 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156
Jesus Ceaac451502011-04-20 17:09:23 +020012157 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12158 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012159 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 if (PyUnicode_READY(self) == -1)
12162 return NULL;
12163 if (PyUnicode_READY(substring) == -1)
12164 return NULL;
12165
Victor Stinner7931d9a2011-11-04 00:22:48 +010012166 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167
12168 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 if (result == -2)
12171 return NULL;
12172
Christian Heimes217cfd12007-12-02 14:31:20 +000012173 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174}
12175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012176PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012177 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012179Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180
12181static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012184 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012185 Py_ssize_t start;
12186 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012187 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188
Jesus Ceaac451502011-04-20 17:09:23 +020012189 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12190 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012191 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 if (PyUnicode_READY(self) == -1)
12194 return NULL;
12195 if (PyUnicode_READY(substring) == -1)
12196 return NULL;
12197
Victor Stinner7931d9a2011-11-04 00:22:48 +010012198 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199
12200 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 if (result == -2)
12203 return NULL;
12204
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205 if (result < 0) {
12206 PyErr_SetString(PyExc_ValueError, "substring not found");
12207 return NULL;
12208 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209
Christian Heimes217cfd12007-12-02 14:31:20 +000012210 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211}
12212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012213PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012214 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012216Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012217done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218
12219static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012220unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012222 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 Py_UCS4 fillchar = ' ';
12224
Victor Stinnere9a29352011-10-01 02:14:59 +020012225 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012227
Benjamin Petersonbac79492012-01-14 13:34:47 -050012228 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229 return NULL;
12230
Victor Stinnerc4b49542011-12-11 22:44:26 +010012231 if (PyUnicode_GET_LENGTH(self) >= width)
12232 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233
Victor Stinnerc4b49542011-12-11 22:44:26 +010012234 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235}
12236
Alexander Belopolsky40018472011-02-26 01:02:56 +000012237PyObject *
12238PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239{
12240 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012241
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242 s = PyUnicode_FromObject(s);
12243 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012244 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 if (sep != NULL) {
12246 sep = PyUnicode_FromObject(sep);
12247 if (sep == NULL) {
12248 Py_DECREF(s);
12249 return NULL;
12250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 }
12252
Victor Stinner9310abb2011-10-05 00:59:23 +020012253 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254
12255 Py_DECREF(s);
12256 Py_XDECREF(sep);
12257 return result;
12258}
12259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012260PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012261 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262\n\
12263Return a list of the words in S, using sep as the\n\
12264delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012265splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012266whitespace string is a separator and empty strings are\n\
12267removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268
12269static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012270unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012272 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012274 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012276 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12277 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278 return NULL;
12279
12280 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012281 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012283 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012285 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286}
12287
Thomas Wouters477c8d52006-05-27 19:21:47 +000012288PyObject *
12289PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12290{
12291 PyObject* str_obj;
12292 PyObject* sep_obj;
12293 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 int kind1, kind2, kind;
12295 void *buf1 = NULL, *buf2 = NULL;
12296 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012297
12298 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012299 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012301 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012302 if (!sep_obj) {
12303 Py_DECREF(str_obj);
12304 return NULL;
12305 }
12306 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12307 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012308 Py_DECREF(str_obj);
12309 return NULL;
12310 }
12311
Victor Stinner14f8f022011-10-05 20:58:25 +020012312 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012314 kind = Py_MAX(kind1, kind2);
12315 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012317 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 if (!buf1)
12319 goto onError;
12320 buf2 = PyUnicode_DATA(sep_obj);
12321 if (kind2 != kind)
12322 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12323 if (!buf2)
12324 goto onError;
12325 len1 = PyUnicode_GET_LENGTH(str_obj);
12326 len2 = PyUnicode_GET_LENGTH(sep_obj);
12327
Benjamin Petersonead6b532011-12-20 17:23:42 -060012328 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012330 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12331 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12332 else
12333 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 break;
12335 case PyUnicode_2BYTE_KIND:
12336 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12337 break;
12338 case PyUnicode_4BYTE_KIND:
12339 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12340 break;
12341 default:
12342 assert(0);
12343 out = 0;
12344 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012345
12346 Py_DECREF(sep_obj);
12347 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 if (kind1 != kind)
12349 PyMem_Free(buf1);
12350 if (kind2 != kind)
12351 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012352
12353 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 onError:
12355 Py_DECREF(sep_obj);
12356 Py_DECREF(str_obj);
12357 if (kind1 != kind && buf1)
12358 PyMem_Free(buf1);
12359 if (kind2 != kind && buf2)
12360 PyMem_Free(buf2);
12361 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012362}
12363
12364
12365PyObject *
12366PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12367{
12368 PyObject* str_obj;
12369 PyObject* sep_obj;
12370 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 int kind1, kind2, kind;
12372 void *buf1 = NULL, *buf2 = NULL;
12373 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012374
12375 str_obj = PyUnicode_FromObject(str_in);
12376 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012377 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012378 sep_obj = PyUnicode_FromObject(sep_in);
12379 if (!sep_obj) {
12380 Py_DECREF(str_obj);
12381 return NULL;
12382 }
12383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 kind1 = PyUnicode_KIND(str_in);
12385 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012386 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 buf1 = PyUnicode_DATA(str_in);
12388 if (kind1 != kind)
12389 buf1 = _PyUnicode_AsKind(str_in, kind);
12390 if (!buf1)
12391 goto onError;
12392 buf2 = PyUnicode_DATA(sep_obj);
12393 if (kind2 != kind)
12394 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12395 if (!buf2)
12396 goto onError;
12397 len1 = PyUnicode_GET_LENGTH(str_obj);
12398 len2 = PyUnicode_GET_LENGTH(sep_obj);
12399
Benjamin Petersonead6b532011-12-20 17:23:42 -060012400 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012402 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12403 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12404 else
12405 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 break;
12407 case PyUnicode_2BYTE_KIND:
12408 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12409 break;
12410 case PyUnicode_4BYTE_KIND:
12411 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12412 break;
12413 default:
12414 assert(0);
12415 out = 0;
12416 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012417
12418 Py_DECREF(sep_obj);
12419 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012420 if (kind1 != kind)
12421 PyMem_Free(buf1);
12422 if (kind2 != kind)
12423 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012424
12425 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 onError:
12427 Py_DECREF(sep_obj);
12428 Py_DECREF(str_obj);
12429 if (kind1 != kind && buf1)
12430 PyMem_Free(buf1);
12431 if (kind2 != kind && buf2)
12432 PyMem_Free(buf2);
12433 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012434}
12435
12436PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012437 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012438\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012439Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012440the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012441found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012442
12443static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012444unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012445{
Victor Stinner9310abb2011-10-05 00:59:23 +020012446 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012447}
12448
12449PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012450 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012451\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012452Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012453the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012454separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012455
12456static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012457unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012458{
Victor Stinner9310abb2011-10-05 00:59:23 +020012459 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012460}
12461
Alexander Belopolsky40018472011-02-26 01:02:56 +000012462PyObject *
12463PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012464{
12465 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012466
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012467 s = PyUnicode_FromObject(s);
12468 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012469 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 if (sep != NULL) {
12471 sep = PyUnicode_FromObject(sep);
12472 if (sep == NULL) {
12473 Py_DECREF(s);
12474 return NULL;
12475 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012476 }
12477
Victor Stinner9310abb2011-10-05 00:59:23 +020012478 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012479
12480 Py_DECREF(s);
12481 Py_XDECREF(sep);
12482 return result;
12483}
12484
12485PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012486 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012487\n\
12488Return a list of the words in S, using sep as the\n\
12489delimiter string, starting at the end of the string and\n\
12490working to the front. If maxsplit is given, at most maxsplit\n\
12491splits are done. If sep is not specified, any whitespace string\n\
12492is a separator.");
12493
12494static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012495unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012496{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012497 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012498 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012499 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012500
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012501 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12502 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012503 return NULL;
12504
12505 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012507 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012508 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012509 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012510 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012511}
12512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012513PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012514 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515\n\
12516Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012517Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012518is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519
12520static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012521unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012523 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012524 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012526 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12527 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528 return NULL;
12529
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012530 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531}
12532
12533static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012534PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012536 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537}
12538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012539PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012540 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541\n\
12542Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012543and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544
12545static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012546unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012548 if (PyUnicode_READY(self) == -1)
12549 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012550 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551}
12552
Georg Brandlceee0772007-11-27 23:48:05 +000012553PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012554 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012555\n\
12556Return a translation table usable for str.translate().\n\
12557If there is only one argument, it must be a dictionary mapping Unicode\n\
12558ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012559Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012560If there are two arguments, they must be strings of equal length, and\n\
12561in the resulting dictionary, each character in x will be mapped to the\n\
12562character at the same position in y. If there is a third argument, it\n\
12563must be a string, whose characters will be mapped to None in the result.");
12564
12565static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012566unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012567{
12568 PyObject *x, *y = NULL, *z = NULL;
12569 PyObject *new = NULL, *key, *value;
12570 Py_ssize_t i = 0;
12571 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012572
Georg Brandlceee0772007-11-27 23:48:05 +000012573 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12574 return NULL;
12575 new = PyDict_New();
12576 if (!new)
12577 return NULL;
12578 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 int x_kind, y_kind, z_kind;
12580 void *x_data, *y_data, *z_data;
12581
Georg Brandlceee0772007-11-27 23:48:05 +000012582 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012583 if (!PyUnicode_Check(x)) {
12584 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12585 "be a string if there is a second argument");
12586 goto err;
12587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012589 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12590 "arguments must have equal length");
12591 goto err;
12592 }
12593 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 x_kind = PyUnicode_KIND(x);
12595 y_kind = PyUnicode_KIND(y);
12596 x_data = PyUnicode_DATA(x);
12597 y_data = PyUnicode_DATA(y);
12598 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12599 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012600 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012601 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012602 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012603 if (!value) {
12604 Py_DECREF(key);
12605 goto err;
12606 }
Georg Brandlceee0772007-11-27 23:48:05 +000012607 res = PyDict_SetItem(new, key, value);
12608 Py_DECREF(key);
12609 Py_DECREF(value);
12610 if (res < 0)
12611 goto err;
12612 }
12613 /* create entries for deleting chars in z */
12614 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615 z_kind = PyUnicode_KIND(z);
12616 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012617 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012619 if (!key)
12620 goto err;
12621 res = PyDict_SetItem(new, key, Py_None);
12622 Py_DECREF(key);
12623 if (res < 0)
12624 goto err;
12625 }
12626 }
12627 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 int kind;
12629 void *data;
12630
Georg Brandlceee0772007-11-27 23:48:05 +000012631 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012632 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012633 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12634 "to maketrans it must be a dict");
12635 goto err;
12636 }
12637 /* copy entries into the new dict, converting string keys to int keys */
12638 while (PyDict_Next(x, &i, &key, &value)) {
12639 if (PyUnicode_Check(key)) {
12640 /* convert string keys to integer keys */
12641 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012642 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012643 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12644 "table must be of length 1");
12645 goto err;
12646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 kind = PyUnicode_KIND(key);
12648 data = PyUnicode_DATA(key);
12649 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012650 if (!newkey)
12651 goto err;
12652 res = PyDict_SetItem(new, newkey, value);
12653 Py_DECREF(newkey);
12654 if (res < 0)
12655 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012656 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012657 /* just keep integer keys */
12658 if (PyDict_SetItem(new, key, value) < 0)
12659 goto err;
12660 } else {
12661 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12662 "be strings or integers");
12663 goto err;
12664 }
12665 }
12666 }
12667 return new;
12668 err:
12669 Py_DECREF(new);
12670 return NULL;
12671}
12672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012673PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012674 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675\n\
12676Return a copy of the string S, where all characters have been mapped\n\
12677through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012678Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012679Unmapped characters are left untouched. Characters mapped to None\n\
12680are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012681
12682static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686}
12687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012688PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012689 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012691Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692
12693static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012694unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012696 if (PyUnicode_READY(self) == -1)
12697 return NULL;
12698 if (PyUnicode_IS_ASCII(self))
12699 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012700 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701}
12702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012703PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012704 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012706Pad a numeric string S with zeros on the left, to fill a field\n\
12707of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708
12709static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012710unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012712 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012713 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012714 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 int kind;
12716 void *data;
12717 Py_UCS4 chr;
12718
Martin v. Löwis18e16552006-02-15 17:27:45 +000012719 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720 return NULL;
12721
Benjamin Petersonbac79492012-01-14 13:34:47 -050012722 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724
Victor Stinnerc4b49542011-12-11 22:44:26 +010012725 if (PyUnicode_GET_LENGTH(self) >= width)
12726 return unicode_result_unchanged(self);
12727
12728 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729
12730 u = pad(self, fill, 0, '0');
12731
Walter Dörwald068325e2002-04-15 13:36:47 +000012732 if (u == NULL)
12733 return NULL;
12734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 kind = PyUnicode_KIND(u);
12736 data = PyUnicode_DATA(u);
12737 chr = PyUnicode_READ(kind, data, fill);
12738
12739 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 PyUnicode_WRITE(kind, data, 0, chr);
12742 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743 }
12744
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012745 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012746 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012748
#if 0
/* Disabled: wrapper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII().  Not compiled. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012757PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012758 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012760Return True if S starts with the specified prefix, False otherwise.\n\
12761With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012762With optional end, stop comparing S at that position.\n\
12763prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764
12765static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012766unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012767 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012769 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012770 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012771 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012772 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012773 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012774
Jesus Ceaac451502011-04-20 17:09:23 +020012775 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012776 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012777 if (PyTuple_Check(subobj)) {
12778 Py_ssize_t i;
12779 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012780 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012781 if (substring == NULL)
12782 return NULL;
12783 result = tailmatch(self, substring, start, end, -1);
12784 Py_DECREF(substring);
12785 if (result) {
12786 Py_RETURN_TRUE;
12787 }
12788 }
12789 /* nothing matched */
12790 Py_RETURN_FALSE;
12791 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012792 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012793 if (substring == NULL) {
12794 if (PyErr_ExceptionMatches(PyExc_TypeError))
12795 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12796 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012798 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012799 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012801 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802}
12803
12804
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012805PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012806 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012808Return True if S ends with the specified suffix, False otherwise.\n\
12809With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012810With optional end, stop comparing S at that position.\n\
12811suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812
12813static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012814unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012817 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012818 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012819 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012820 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012821 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822
Jesus Ceaac451502011-04-20 17:09:23 +020012823 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012825 if (PyTuple_Check(subobj)) {
12826 Py_ssize_t i;
12827 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012828 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012830 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012831 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012832 result = tailmatch(self, substring, start, end, +1);
12833 Py_DECREF(substring);
12834 if (result) {
12835 Py_RETURN_TRUE;
12836 }
12837 }
12838 Py_RETURN_FALSE;
12839 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012840 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012841 if (substring == NULL) {
12842 if (PyErr_ExceptionMatches(PyExc_TypeError))
12843 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12844 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012845 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012846 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012847 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012849 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850}
12851
Victor Stinner202fdca2012-05-07 12:47:02 +020012852Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012853_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012854{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012855 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012856 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12857 writer->data = PyUnicode_DATA(writer->buffer);
12858 writer->kind = PyUnicode_KIND(writer->buffer);
12859}
12860
Victor Stinnerd3f08822012-05-29 12:57:52 +020012861void
12862_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012863{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012864 memset(writer, 0, sizeof(*writer));
12865#ifdef Py_DEBUG
12866 writer->kind = 5; /* invalid kind */
12867#endif
12868 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012869 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012870}
12871
Victor Stinnerd3f08822012-05-29 12:57:52 +020012872int
12873_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12874 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012875{
12876 Py_ssize_t newlen;
12877 PyObject *newbuffer;
12878
Victor Stinnerd3f08822012-05-29 12:57:52 +020012879 assert(length > 0);
12880
Victor Stinner202fdca2012-05-07 12:47:02 +020012881 if (length > PY_SSIZE_T_MAX - writer->pos) {
12882 PyErr_NoMemory();
12883 return -1;
12884 }
12885 newlen = writer->pos + length;
12886
Victor Stinnerd3f08822012-05-29 12:57:52 +020012887 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012888 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012889 /* overallocate 25% to limit the number of resize */
12890 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12891 newlen += newlen / 4;
12892 if (newlen < writer->min_length)
12893 newlen = writer->min_length;
12894 }
12895 writer->buffer = PyUnicode_New(newlen, maxchar);
12896 if (writer->buffer == NULL)
12897 return -1;
12898 _PyUnicodeWriter_Update(writer);
12899 return 0;
12900 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012901
Victor Stinnerd3f08822012-05-29 12:57:52 +020012902 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012903 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012904 /* overallocate 25% to limit the number of resize */
12905 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12906 newlen += newlen / 4;
12907 if (newlen < writer->min_length)
12908 newlen = writer->min_length;
12909 }
12910
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012911 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012912 /* resize + widen */
12913 newbuffer = PyUnicode_New(newlen, maxchar);
12914 if (newbuffer == NULL)
12915 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012916 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12917 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012918 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012919 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012920 }
12921 else {
12922 newbuffer = resize_compact(writer->buffer, newlen);
12923 if (newbuffer == NULL)
12924 return -1;
12925 }
12926 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012927 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012928 }
12929 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012930 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012931 newbuffer = PyUnicode_New(writer->size, maxchar);
12932 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012933 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012934 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12935 writer->buffer, 0, writer->pos);
12936 Py_DECREF(writer->buffer);
12937 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012938 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012939 }
12940 return 0;
12941}
12942
Victor Stinnerd3f08822012-05-29 12:57:52 +020012943int
12944_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12945{
12946 Py_UCS4 maxchar;
12947 Py_ssize_t len;
12948
12949 if (PyUnicode_READY(str) == -1)
12950 return -1;
12951 len = PyUnicode_GET_LENGTH(str);
12952 if (len == 0)
12953 return 0;
12954 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12955 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012956 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012957 Py_INCREF(str);
12958 writer->buffer = str;
12959 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012960 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012961 writer->size = 0;
12962 writer->pos += len;
12963 return 0;
12964 }
12965 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12966 return -1;
12967 }
12968 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12969 str, 0, len);
12970 writer->pos += len;
12971 return 0;
12972}
12973
12974PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012975_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012976{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012977 if (writer->pos == 0) {
12978 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020012979 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020012980 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012981 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012982 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12983 return writer->buffer;
12984 }
12985 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12986 PyObject *newbuffer;
12987 newbuffer = resize_compact(writer->buffer, writer->pos);
12988 if (newbuffer == NULL) {
12989 Py_DECREF(writer->buffer);
12990 return NULL;
12991 }
12992 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012993 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012994 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner2cb16aa2013-03-06 19:28:37 +010012995 return unicode_result_ready(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012996}
12997
Victor Stinnerd3f08822012-05-29 12:57:52 +020012998void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012999_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013000{
13001 Py_CLEAR(writer->buffer);
13002}
13003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013005
13006PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013007 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013008\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013009Return a formatted version of S, using substitutions from args and kwargs.\n\
13010The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013011
Eric Smith27bbca62010-11-04 17:06:58 +000013012PyDoc_STRVAR(format_map__doc__,
13013 "S.format_map(mapping) -> str\n\
13014\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013015Return a formatted version of S, using substitutions from mapping.\n\
13016The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013017
Eric Smith4a7d76d2008-05-30 18:10:19 +000013018static PyObject *
13019unicode__format__(PyObject* self, PyObject* args)
13020{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013021 PyObject *format_spec;
13022 _PyUnicodeWriter writer;
13023 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013024
13025 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13026 return NULL;
13027
Victor Stinnerd3f08822012-05-29 12:57:52 +020013028 if (PyUnicode_READY(self) == -1)
13029 return NULL;
13030 _PyUnicodeWriter_Init(&writer, 0);
13031 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13032 self, format_spec, 0,
13033 PyUnicode_GET_LENGTH(format_spec));
13034 if (ret == -1) {
13035 _PyUnicodeWriter_Dealloc(&writer);
13036 return NULL;
13037 }
13038 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013039}
13040
Eric Smith8c663262007-08-25 02:26:07 +000013041PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013042 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013043\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013044Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013045
13046static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013047unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013048{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013049 Py_ssize_t size;
13050
13051 /* If it's a compact object, account for base structure +
13052 character data. */
13053 if (PyUnicode_IS_COMPACT_ASCII(v))
13054 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13055 else if (PyUnicode_IS_COMPACT(v))
13056 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013057 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013058 else {
13059 /* If it is a two-block object, account for base object, and
13060 for character block if present. */
13061 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013062 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013064 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013065 }
13066 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013067 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013068 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013069 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013070 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013071 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013072
13073 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013074}
13075
13076PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013077 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013078
13079static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013080unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013081{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013082 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013083 if (!copy)
13084 return NULL;
13085 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013086}
13087
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013089 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013090 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013091 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13092 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013093 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13094 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013095 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013096 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13097 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13098 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13099 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13100 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013101 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013102 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13103 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13104 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013105 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013106 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13107 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13108 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013109 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013110 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013111 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013112 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013113 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13114 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13115 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13116 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13117 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13118 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13119 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13120 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13121 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13122 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13123 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13124 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13125 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13126 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013127 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013128 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013129 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013130 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013131 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013132 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013133 {"maketrans", (PyCFunction) unicode_maketrans,
13134 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013135 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013136#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013137 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013138 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139#endif
13140
Benjamin Peterson14339b62009-01-31 16:36:08 +000013141 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142 {NULL, NULL}
13143};
13144
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013145static PyObject *
13146unicode_mod(PyObject *v, PyObject *w)
13147{
Brian Curtindfc80e32011-08-10 20:28:54 -050013148 if (!PyUnicode_Check(v))
13149 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013150 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013151}
13152
13153static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013154 0, /*nb_add*/
13155 0, /*nb_subtract*/
13156 0, /*nb_multiply*/
13157 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013158};
13159
Guido van Rossumd57fd912000-03-10 22:53:23 +000013160static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013161 (lenfunc) unicode_length, /* sq_length */
13162 PyUnicode_Concat, /* sq_concat */
13163 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13164 (ssizeargfunc) unicode_getitem, /* sq_item */
13165 0, /* sq_slice */
13166 0, /* sq_ass_item */
13167 0, /* sq_ass_slice */
13168 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013169};
13170
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013171static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013172unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013174 if (PyUnicode_READY(self) == -1)
13175 return NULL;
13176
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013177 if (PyIndex_Check(item)) {
13178 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013179 if (i == -1 && PyErr_Occurred())
13180 return NULL;
13181 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013182 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013183 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013184 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013185 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013186 PyObject *result;
13187 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013188 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013189 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013192 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013193 return NULL;
13194 }
13195
13196 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013197 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013199 slicelength == PyUnicode_GET_LENGTH(self)) {
13200 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013201 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013202 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013203 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013204 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013205 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013206 src_kind = PyUnicode_KIND(self);
13207 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013208 if (!PyUnicode_IS_ASCII(self)) {
13209 kind_limit = kind_maxchar_limit(src_kind);
13210 max_char = 0;
13211 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13212 ch = PyUnicode_READ(src_kind, src_data, cur);
13213 if (ch > max_char) {
13214 max_char = ch;
13215 if (max_char >= kind_limit)
13216 break;
13217 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013218 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013219 }
Victor Stinner55c99112011-10-13 01:17:06 +020013220 else
13221 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013222 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013223 if (result == NULL)
13224 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013225 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013226 dest_data = PyUnicode_DATA(result);
13227
13228 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013229 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13230 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013231 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013232 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013233 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013234 } else {
13235 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13236 return NULL;
13237 }
13238}
13239
13240static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013241 (lenfunc)unicode_length, /* mp_length */
13242 (binaryfunc)unicode_subscript, /* mp_subscript */
13243 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013244};
13245
Guido van Rossumd57fd912000-03-10 22:53:23 +000013246
Guido van Rossumd57fd912000-03-10 22:53:23 +000013247/* Helpers for PyUnicode_Format() */
13248
13249static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013250getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013252 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013254 (*p_argidx)++;
13255 if (arglen < 0)
13256 return args;
13257 else
13258 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013259 }
13260 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013261 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262 return NULL;
13263}
13264
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013265/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013266
Victor Stinnerd3f08822012-05-29 12:57:52 +020013267static int
13268formatfloat(PyObject *v, int flags, int prec, int type,
13269 PyObject **p_output, _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013271 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013272 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013273 Py_ssize_t len;
Tim Petersced69f82003-09-16 20:30:58 +000013274
Guido van Rossumd57fd912000-03-10 22:53:23 +000013275 x = PyFloat_AsDouble(v);
13276 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013277 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013278
Guido van Rossumd57fd912000-03-10 22:53:23 +000013279 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013280 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013281
Eric Smith0923d1d2009-04-16 20:16:10 +000013282 p = PyOS_double_to_string(x, type, prec,
13283 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013284 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013285 return -1;
13286 len = strlen(p);
13287 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013288 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13289 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013290 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013291 }
Victor Stinner184252a2012-06-16 02:57:41 +020013292 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013293 writer->pos += len;
13294 }
13295 else
13296 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013297 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013298 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013299}
13300
Victor Stinnerd0880d52012-04-27 23:40:13 +020013301/* formatlong() emulates the format codes d, u, o, x and X, and
13302 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13303 * Python's regular ints.
13304 * Return value: a new PyUnicodeObject*, or NULL if error.
13305 * The output string is of the form
13306 * "-"? ("0x" | "0X")? digit+
13307 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13308 * set in flags. The case of hex digits will be correct,
13309 * There will be at least prec digits, zero-filled on the left if
13310 * necessary to get that many.
13311 * val object to be converted
13312 * flags bitmask of format flags; only F_ALT is looked at
13313 * prec minimum number of digits; 0-fill on left if needed
13314 * type a character in [duoxX]; u acts the same as d
13315 *
13316 * CAUTION: o, x and X conversions on regular ints can never
13317 * produce a '-' sign, but can for Python's unbounded ints.
13318 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013319static PyObject*
13320formatlong(PyObject *val, int flags, int prec, int type)
13321{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013322 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013323 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013324 Py_ssize_t i;
13325 int sign; /* 1 if '-', else 0 */
13326 int len; /* number of characters */
13327 Py_ssize_t llen;
13328 int numdigits; /* len == numnondigits + numdigits */
13329 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013330
Victor Stinnerd0880d52012-04-27 23:40:13 +020013331 /* Avoid exceeding SSIZE_T_MAX */
13332 if (prec > INT_MAX-3) {
13333 PyErr_SetString(PyExc_OverflowError,
13334 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013335 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013336 }
13337
13338 assert(PyLong_Check(val));
13339
13340 switch (type) {
13341 case 'd':
13342 case 'u':
13343 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013344 if (PyBool_Check(val))
13345 result = PyNumber_ToBase(val, 10);
13346 else
13347 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013348 break;
13349 case 'o':
13350 numnondigits = 2;
13351 result = PyNumber_ToBase(val, 8);
13352 break;
13353 case 'x':
13354 case 'X':
13355 numnondigits = 2;
13356 result = PyNumber_ToBase(val, 16);
13357 break;
13358 default:
13359 assert(!"'type' not in [duoxX]");
13360 }
13361 if (!result)
13362 return NULL;
13363
13364 assert(unicode_modifiable(result));
13365 assert(PyUnicode_IS_READY(result));
13366 assert(PyUnicode_IS_ASCII(result));
13367
13368 /* To modify the string in-place, there can only be one reference. */
13369 if (Py_REFCNT(result) != 1) {
13370 PyErr_BadInternalCall();
13371 return NULL;
13372 }
13373 buf = PyUnicode_DATA(result);
13374 llen = PyUnicode_GET_LENGTH(result);
13375 if (llen > INT_MAX) {
13376 PyErr_SetString(PyExc_ValueError,
13377 "string too large in _PyBytes_FormatLong");
13378 return NULL;
13379 }
13380 len = (int)llen;
13381 sign = buf[0] == '-';
13382 numnondigits += sign;
13383 numdigits = len - numnondigits;
13384 assert(numdigits > 0);
13385
13386 /* Get rid of base marker unless F_ALT */
13387 if (((flags & F_ALT) == 0 &&
13388 (type == 'o' || type == 'x' || type == 'X'))) {
13389 assert(buf[sign] == '0');
13390 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13391 buf[sign+1] == 'o');
13392 numnondigits -= 2;
13393 buf += 2;
13394 len -= 2;
13395 if (sign)
13396 buf[0] = '-';
13397 assert(len == numnondigits + numdigits);
13398 assert(numdigits > 0);
13399 }
13400
13401 /* Fill with leading zeroes to meet minimum width. */
13402 if (prec > numdigits) {
13403 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13404 numnondigits + prec);
13405 char *b1;
13406 if (!r1) {
13407 Py_DECREF(result);
13408 return NULL;
13409 }
13410 b1 = PyBytes_AS_STRING(r1);
13411 for (i = 0; i < numnondigits; ++i)
13412 *b1++ = *buf++;
13413 for (i = 0; i < prec - numdigits; i++)
13414 *b1++ = '0';
13415 for (i = 0; i < numdigits; i++)
13416 *b1++ = *buf++;
13417 *b1 = '\0';
13418 Py_DECREF(result);
13419 result = r1;
13420 buf = PyBytes_AS_STRING(result);
13421 len = numnondigits + prec;
13422 }
13423
13424 /* Fix up case for hex conversions. */
13425 if (type == 'X') {
13426 /* Need to convert all lower case letters to upper case.
13427 and need to convert 0x to 0X (and -0x to -0X). */
13428 for (i = 0; i < len; i++)
13429 if (buf[i] >= 'a' && buf[i] <= 'x')
13430 buf[i] -= 'a'-'A';
13431 }
13432 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13433 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013434 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013435 Py_DECREF(result);
13436 result = unicode;
13437 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013438 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013439}
13440
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013441static Py_UCS4
13442formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013443{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013444 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013445 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013446 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013447 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013449 goto onError;
13450 }
13451 else {
13452 /* Integer input truncated to a character */
13453 long x;
13454 x = PyLong_AsLong(v);
13455 if (x == -1 && PyErr_Occurred())
13456 goto onError;
13457
Victor Stinner8faf8212011-12-08 22:14:11 +010013458 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013459 PyErr_SetString(PyExc_OverflowError,
13460 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013461 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013462 }
13463
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013464 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013465 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013466
Benjamin Peterson29060642009-01-31 22:14:21 +000013467 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013468 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013469 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013470 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013471}
13472
Alexander Belopolsky40018472011-02-26 01:02:56 +000013473PyObject *
13474PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013475{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013476 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013477 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013478 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013479 PyObject *temp = NULL;
13480 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013481 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013482 void *fmt;
13483 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013484 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013485 Py_ssize_t sublen;
13486 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013487
Guido van Rossumd57fd912000-03-10 22:53:23 +000013488 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013489 PyErr_BadInternalCall();
13490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013491 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013492 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013493 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013494 return NULL;
Victor Stinner19294072012-10-05 00:09:33 +020013495 if (PyUnicode_READY(uformat) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013496 Py_DECREF(uformat);
Victor Stinner19294072012-10-05 00:09:33 +020013497 return NULL;
13498 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 fmt = PyUnicode_DATA(uformat);
13501 fmtkind = PyUnicode_KIND(uformat);
13502 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13503 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013504
Victor Stinnerd3f08822012-05-29 12:57:52 +020013505 _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013506
Guido van Rossumd57fd912000-03-10 22:53:23 +000013507 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013508 arglen = PyTuple_Size(args);
13509 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013510 }
13511 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013512 arglen = -1;
13513 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013514 }
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013515 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013516 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013517
13518 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013519 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013520 Py_ssize_t nonfmtpos;
13521 nonfmtpos = fmtpos++;
13522 while (fmtcnt >= 0 &&
13523 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13524 fmtpos++;
13525 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013526 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013527 if (fmtcnt < 0)
13528 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013529 sublen = fmtpos - nonfmtpos;
13530 maxchar = _PyUnicode_FindMaxChar(uformat,
13531 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013532 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013533 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013534
Victor Stinnerd3f08822012-05-29 12:57:52 +020013535 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13536 uformat, nonfmtpos, sublen);
Victor Stinneree4544c2012-05-09 22:24:08 +020013537 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013538 }
13539 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013540 /* Got a format specifier */
13541 int flags = 0;
13542 Py_ssize_t width = -1;
13543 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013544 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013545 Py_UCS4 fill;
13546 int sign;
13547 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013548 int isnumok;
13549 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013550 void *pbuf = NULL;
13551 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013552 Py_UCS4 bufmaxchar;
13553 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013555 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013556 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13557 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013558 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013559 Py_ssize_t keylen;
13560 PyObject *key;
13561 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013562
Benjamin Peterson29060642009-01-31 22:14:21 +000013563 if (dict == NULL) {
13564 PyErr_SetString(PyExc_TypeError,
13565 "format requires a mapping");
13566 goto onError;
13567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013568 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013570 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013571 /* Skip over balanced parentheses */
13572 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013573 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13574 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013575 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013576 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013577 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013578 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013580 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 if (fmtcnt < 0 || pcount > 0) {
13582 PyErr_SetString(PyExc_ValueError,
13583 "incomplete format key");
13584 goto onError;
13585 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013586 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013587 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013588 if (key == NULL)
13589 goto onError;
13590 if (args_owned) {
13591 Py_DECREF(args);
13592 args_owned = 0;
13593 }
13594 args = PyObject_GetItem(dict, key);
13595 Py_DECREF(key);
13596 if (args == NULL) {
13597 goto onError;
13598 }
13599 args_owned = 1;
13600 arglen = -1;
13601 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013602 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013603 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013604 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13605 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013606 case '-': flags |= F_LJUST; continue;
13607 case '+': flags |= F_SIGN; continue;
13608 case ' ': flags |= F_BLANK; continue;
13609 case '#': flags |= F_ALT; continue;
13610 case '0': flags |= F_ZERO; continue;
13611 }
13612 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013613 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013614 if (c == '*') {
13615 v = getnextarg(args, arglen, &argidx);
13616 if (v == NULL)
13617 goto onError;
13618 if (!PyLong_Check(v)) {
13619 PyErr_SetString(PyExc_TypeError,
13620 "* wants int");
13621 goto onError;
13622 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013623 width = PyLong_AsSsize_t(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013624 if (width == -1 && PyErr_Occurred())
13625 goto onError;
13626 if (width < 0) {
13627 flags |= F_LJUST;
13628 width = -width;
13629 }
13630 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013632 }
13633 else if (c >= '0' && c <= '9') {
13634 width = c - '0';
13635 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013636 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013637 if (c < '0' || c > '9')
13638 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013639 /* Since c is unsigned, the RHS would end up as unsigned,
13640 mixing signed and unsigned comparison. Since c is between
13641 '0' and '9', casting to int is safe. */
13642 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013643 PyErr_SetString(PyExc_ValueError,
13644 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013645 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013646 }
13647 width = width*10 + (c - '0');
13648 }
13649 }
13650 if (c == '.') {
13651 prec = 0;
13652 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013653 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013654 if (c == '*') {
13655 v = getnextarg(args, arglen, &argidx);
13656 if (v == NULL)
13657 goto onError;
13658 if (!PyLong_Check(v)) {
13659 PyErr_SetString(PyExc_TypeError,
13660 "* wants int");
13661 goto onError;
13662 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013663 prec = _PyLong_AsInt(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013664 if (prec == -1 && PyErr_Occurred())
13665 goto onError;
13666 if (prec < 0)
13667 prec = 0;
13668 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013669 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013670 }
13671 else if (c >= '0' && c <= '9') {
13672 prec = c - '0';
13673 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013674 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013675 if (c < '0' || c > '9')
13676 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013677 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013678 PyErr_SetString(PyExc_ValueError,
13679 "prec too big");
13680 goto onError;
13681 }
13682 prec = prec*10 + (c - '0');
13683 }
13684 }
13685 } /* prec */
13686 if (fmtcnt >= 0) {
13687 if (c == 'h' || c == 'l' || c == 'L') {
13688 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013689 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013690 }
13691 }
13692 if (fmtcnt < 0) {
13693 PyErr_SetString(PyExc_ValueError,
13694 "incomplete format");
13695 goto onError;
13696 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013697 if (fmtcnt == 0)
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013698 writer.overallocate = 0;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013699
13700 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013701 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013702 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013703 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13704 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013705 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013706 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013707
Victor Stinneraff3cc62012-04-30 05:19:21 +020013708 v = getnextarg(args, arglen, &argidx);
13709 if (v == NULL)
13710 goto onError;
13711
Benjamin Peterson29060642009-01-31 22:14:21 +000013712 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013713 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 fill = ' ';
13715 switch (c) {
13716
Benjamin Peterson29060642009-01-31 22:14:21 +000013717 case 's':
13718 case 'r':
13719 case 'a':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013720 if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13721 /* Fast path */
13722 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13723 goto onError;
13724 goto nextarg;
13725 }
13726
Victor Stinner808fc0a2010-03-22 12:50:40 +000013727 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013728 temp = v;
13729 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013730 }
13731 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013732 if (c == 's')
13733 temp = PyObject_Str(v);
13734 else if (c == 'r')
13735 temp = PyObject_Repr(v);
13736 else
13737 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013738 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013739 break;
13740
13741 case 'i':
13742 case 'd':
13743 case 'u':
13744 case 'o':
13745 case 'x':
13746 case 'X':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013747 if (PyLong_CheckExact(v)
13748 && width == -1 && prec == -1
13749 && !(flags & (F_SIGN | F_BLANK)))
13750 {
13751 /* Fast path */
13752 switch(c)
13753 {
13754 case 'd':
13755 case 'i':
13756 case 'u':
13757 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13758 goto onError;
13759 goto nextarg;
13760 case 'x':
13761 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13762 goto onError;
13763 goto nextarg;
13764 case 'o':
13765 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13766 goto onError;
13767 goto nextarg;
13768 default:
13769 break;
13770 }
13771 }
13772
Benjamin Peterson29060642009-01-31 22:14:21 +000013773 isnumok = 0;
13774 if (PyNumber_Check(v)) {
13775 PyObject *iobj=NULL;
13776
13777 if (PyLong_Check(v)) {
13778 iobj = v;
13779 Py_INCREF(iobj);
13780 }
13781 else {
13782 iobj = PyNumber_Long(v);
13783 }
13784 if (iobj!=NULL) {
13785 if (PyLong_Check(iobj)) {
13786 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013787 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013788 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013789 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013790 }
13791 else {
13792 Py_DECREF(iobj);
13793 }
13794 }
13795 }
13796 if (!isnumok) {
13797 PyErr_Format(PyExc_TypeError,
13798 "%%%c format: a number is required, "
13799 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13800 goto onError;
13801 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013802 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013803 fill = '0';
13804 break;
13805
13806 case 'e':
13807 case 'E':
13808 case 'f':
13809 case 'F':
13810 case 'g':
13811 case 'G':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013812 if (width == -1 && prec == -1
13813 && !(flags & (F_SIGN | F_BLANK)))
13814 {
13815 /* Fast path */
13816 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13817 goto onError;
13818 goto nextarg;
13819 }
13820
Benjamin Peterson29060642009-01-31 22:14:21 +000013821 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013822 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013823 fill = '0';
Victor Stinnerd3f08822012-05-29 12:57:52 +020013824 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13825 temp = NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013826 break;
13827
13828 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013829 {
13830 Py_UCS4 ch = formatchar(v);
13831 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013832 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013833 if (width == -1 && prec == -1) {
13834 /* Fast path */
13835 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13836 goto onError;
13837 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13838 writer.pos += 1;
13839 goto nextarg;
13840 }
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013841 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013842 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013843 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013844
13845 default:
13846 PyErr_Format(PyExc_ValueError,
13847 "unsupported format character '%c' (0x%x) "
13848 "at index %zd",
13849 (31<=c && c<=126) ? (char)c : '?',
13850 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013851 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013852 goto onError;
13853 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013854 if (temp == NULL)
13855 goto onError;
13856 assert (PyUnicode_Check(temp));
Victor Stinnerd3f08822012-05-29 12:57:52 +020013857
13858 if (width == -1 && prec == -1
13859 && !(flags & (F_SIGN | F_BLANK)))
13860 {
13861 /* Fast path */
13862 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13863 goto onError;
13864 goto nextarg;
13865 }
13866
Victor Stinneraff3cc62012-04-30 05:19:21 +020013867 if (PyUnicode_READY(temp) == -1) {
13868 Py_CLEAR(temp);
13869 goto onError;
13870 }
13871 kind = PyUnicode_KIND(temp);
13872 pbuf = PyUnicode_DATA(temp);
13873 len = PyUnicode_GET_LENGTH(temp);
13874
13875 if (c == 's' || c == 'r' || c == 'a') {
13876 if (prec >= 0 && len > prec)
13877 len = prec;
13878 }
13879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013880 /* pbuf is initialized here. */
13881 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013882 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013883 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13884 if (ch == '-' || ch == '+') {
13885 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013886 len--;
13887 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013888 }
13889 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013890 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000013891 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013892 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000013893 else
13894 sign = 0;
13895 }
13896 if (width < len)
13897 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013898
13899 /* Compute the length and maximum character of the
13900 written characters */
13901 bufmaxchar = 127;
13902 if (!(flags & F_LJUST)) {
13903 if (sign) {
13904 if ((width-1) > len)
Benjamin Peterson7e303732013-06-10 09:19:46 -070013905 bufmaxchar = Py_MAX(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013906 }
13907 else {
13908 if (width > len)
Benjamin Peterson7e303732013-06-10 09:19:46 -070013909 bufmaxchar = Py_MAX(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013910 }
13911 }
13912 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
Benjamin Peterson7e303732013-06-10 09:19:46 -070013913 bufmaxchar = Py_MAX(bufmaxchar, maxchar);
Victor Stinneree4544c2012-05-09 22:24:08 +020013914
13915 buflen = width;
13916 if (sign && len == width)
13917 buflen++;
13918
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013919 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020013920 goto onError;
13921
13922 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000013923 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013924 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020013925 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13926 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013927 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013928 if (width > len)
13929 width--;
13930 }
13931 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013932 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013933 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013934 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013935 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13936 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13937 writer.pos += 2;
13938 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000013939 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013940 width -= 2;
13941 if (width < 0)
13942 width = 0;
13943 len -= 2;
13944 }
13945 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013946 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013947 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13948 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013949 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013950 }
13951 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013952 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013953 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13954 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013955 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013956 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013957 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13958 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013959 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13960 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13961 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013962 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013963 }
13964 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013965
Victor Stinnerc9d369f2012-06-16 02:22:37 +020013966 if (len) {
13967 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13968 temp, pindex, len);
13969 writer.pos += len;
13970 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013971 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013972 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013973 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13974 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013975 }
Victor Stinneree4544c2012-05-09 22:24:08 +020013976
Victor Stinnerd3f08822012-05-29 12:57:52 +020013977nextarg:
Benjamin Peterson29060642009-01-31 22:14:21 +000013978 if (dict && (argidx < arglen) && c != '%') {
13979 PyErr_SetString(PyExc_TypeError,
13980 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013981 goto onError;
13982 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013983 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013984 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013985 } /* until end */
13986 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013987 PyErr_SetString(PyExc_TypeError,
13988 "not all arguments converted during string formatting");
13989 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013990 }
13991
13992 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013993 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013994 }
13995 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013996 Py_XDECREF(temp);
13997 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013998 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013999
Benjamin Peterson29060642009-01-31 22:14:21 +000014000 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014001 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014002 Py_XDECREF(temp);
14003 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014004 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014005 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014006 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014007 }
14008 return NULL;
14009}
14010
Jeremy Hylton938ace62002-07-17 16:30:39 +000014011static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014012unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14013
Tim Peters6d6c1a32001-08-02 04:15:00 +000014014static PyObject *
14015unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14016{
Benjamin Peterson29060642009-01-31 22:14:21 +000014017 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014018 static char *kwlist[] = {"object", "encoding", "errors", 0};
14019 char *encoding = NULL;
14020 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014021
Benjamin Peterson14339b62009-01-31 16:36:08 +000014022 if (type != &PyUnicode_Type)
14023 return unicode_subtype_new(type, args, kwds);
14024 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014025 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014026 return NULL;
14027 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014028 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014029 if (encoding == NULL && errors == NULL)
14030 return PyObject_Str(x);
14031 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014032 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014033}
14034
Guido van Rossume023fe02001-08-30 03:12:59 +000014035static PyObject *
14036unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14037{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014038 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014039 Py_ssize_t length, char_size;
14040 int share_wstr, share_utf8;
14041 unsigned int kind;
14042 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014043
Benjamin Peterson14339b62009-01-31 16:36:08 +000014044 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014045
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014046 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014047 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014048 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014049 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014050 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014051 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014052 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014053 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014054
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014055 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014056 if (self == NULL) {
14057 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014058 return NULL;
14059 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014060 kind = PyUnicode_KIND(unicode);
14061 length = PyUnicode_GET_LENGTH(unicode);
14062
14063 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014064#ifdef Py_DEBUG
14065 _PyUnicode_HASH(self) = -1;
14066#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014067 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014068#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014069 _PyUnicode_STATE(self).interned = 0;
14070 _PyUnicode_STATE(self).kind = kind;
14071 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014072 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014073 _PyUnicode_STATE(self).ready = 1;
14074 _PyUnicode_WSTR(self) = NULL;
14075 _PyUnicode_UTF8_LENGTH(self) = 0;
14076 _PyUnicode_UTF8(self) = NULL;
14077 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014078 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014079
14080 share_utf8 = 0;
14081 share_wstr = 0;
14082 if (kind == PyUnicode_1BYTE_KIND) {
14083 char_size = 1;
14084 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14085 share_utf8 = 1;
14086 }
14087 else if (kind == PyUnicode_2BYTE_KIND) {
14088 char_size = 2;
14089 if (sizeof(wchar_t) == 2)
14090 share_wstr = 1;
14091 }
14092 else {
14093 assert(kind == PyUnicode_4BYTE_KIND);
14094 char_size = 4;
14095 if (sizeof(wchar_t) == 4)
14096 share_wstr = 1;
14097 }
14098
14099 /* Ensure we won't overflow the length. */
14100 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14101 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014102 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014103 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014104 data = PyObject_MALLOC((length + 1) * char_size);
14105 if (data == NULL) {
14106 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014107 goto onError;
14108 }
14109
Victor Stinnerc3c74152011-10-02 20:39:55 +020014110 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014111 if (share_utf8) {
14112 _PyUnicode_UTF8_LENGTH(self) = length;
14113 _PyUnicode_UTF8(self) = data;
14114 }
14115 if (share_wstr) {
14116 _PyUnicode_WSTR_LENGTH(self) = length;
14117 _PyUnicode_WSTR(self) = (wchar_t *)data;
14118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014119
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014120 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014121 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014122 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014123#ifdef Py_DEBUG
14124 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14125#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014126 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014127 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014128
14129onError:
14130 Py_DECREF(unicode);
14131 Py_DECREF(self);
14132 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014133}
14134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014135PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014136"str(object='') -> str\n\
14137str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014138\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014139Create a new string object from the given object. If encoding or\n\
14140errors is specified, then the object must expose a data buffer\n\
14141that will be decoded using the given encoding and error handler.\n\
14142Otherwise, returns the result of object.__str__() (if defined)\n\
14143or repr(object).\n\
14144encoding defaults to sys.getdefaultencoding().\n\
14145errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014146
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014147static PyObject *unicode_iter(PyObject *seq);
14148
Guido van Rossumd57fd912000-03-10 22:53:23 +000014149PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014150 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014151 "str", /* tp_name */
14152 sizeof(PyUnicodeObject), /* tp_size */
14153 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014154 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014155 (destructor)unicode_dealloc, /* tp_dealloc */
14156 0, /* tp_print */
14157 0, /* tp_getattr */
14158 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014159 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014160 unicode_repr, /* tp_repr */
14161 &unicode_as_number, /* tp_as_number */
14162 &unicode_as_sequence, /* tp_as_sequence */
14163 &unicode_as_mapping, /* tp_as_mapping */
14164 (hashfunc) unicode_hash, /* tp_hash*/
14165 0, /* tp_call*/
14166 (reprfunc) unicode_str, /* tp_str */
14167 PyObject_GenericGetAttr, /* tp_getattro */
14168 0, /* tp_setattro */
14169 0, /* tp_as_buffer */
14170 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014171 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014172 unicode_doc, /* tp_doc */
14173 0, /* tp_traverse */
14174 0, /* tp_clear */
14175 PyUnicode_RichCompare, /* tp_richcompare */
14176 0, /* tp_weaklistoffset */
14177 unicode_iter, /* tp_iter */
14178 0, /* tp_iternext */
14179 unicode_methods, /* tp_methods */
14180 0, /* tp_members */
14181 0, /* tp_getset */
14182 &PyBaseObject_Type, /* tp_base */
14183 0, /* tp_dict */
14184 0, /* tp_descr_get */
14185 0, /* tp_descr_set */
14186 0, /* tp_dictoffset */
14187 0, /* tp_init */
14188 0, /* tp_alloc */
14189 unicode_new, /* tp_new */
14190 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014191};
14192
14193/* Initialize the Unicode implementation */
14194
Victor Stinner3a50e702011-10-18 21:21:00 +020014195int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014196{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014197 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014198 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014199 0x000A, /* LINE FEED */
14200 0x000D, /* CARRIAGE RETURN */
14201 0x001C, /* FILE SEPARATOR */
14202 0x001D, /* GROUP SEPARATOR */
14203 0x001E, /* RECORD SEPARATOR */
14204 0x0085, /* NEXT LINE */
14205 0x2028, /* LINE SEPARATOR */
14206 0x2029, /* PARAGRAPH SEPARATOR */
14207 };
14208
Fred Drakee4315f52000-05-09 19:53:39 +000014209 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014210 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014211 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014212 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014213 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014214
Guido van Rossumcacfc072002-05-24 19:01:59 +000014215 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014216 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014217
14218 /* initialize the linebreak bloom filter */
14219 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014220 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014221 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014222
Christian Heimes26532f72013-07-20 14:57:16 +020014223 if (PyType_Ready(&EncodingMapType) < 0)
14224 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014225
Benjamin Petersonc4311282012-10-30 23:21:10 -040014226 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14227 Py_FatalError("Can't initialize field name iterator type");
14228
14229 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14230 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014231
Victor Stinner3a50e702011-10-18 21:21:00 +020014232#ifdef HAVE_MBCS
14233 winver.dwOSVersionInfoSize = sizeof(winver);
14234 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14235 PyErr_SetFromWindowsErr(0);
14236 return -1;
14237 }
14238#endif
14239 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014240}
14241
14242/* Finalize the Unicode implementation */
14243
/* This implementation has nothing to release here, so the function
   always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14249
Guido van Rossumd57fd912000-03-10 22:53:23 +000014250void
Thomas Wouters78890102000-07-22 19:25:51 +000014251_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014252{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014253 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014254
Serhiy Storchaka05997252013-01-26 12:14:02 +020014255 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014256
Serhiy Storchaka05997252013-01-26 12:14:02 +020014257 for (i = 0; i < 256; i++)
14258 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014259 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014260 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014261}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014262
Walter Dörwald16807132007-05-25 13:52:07 +000014263void
14264PyUnicode_InternInPlace(PyObject **p)
14265{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014266 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014267 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014268#ifdef Py_DEBUG
14269 assert(s != NULL);
14270 assert(_PyUnicode_CHECK(s));
14271#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014272 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014273 return;
14274#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014275 /* If it's a subclass, we don't really know what putting
14276 it in the interned dict might do. */
14277 if (!PyUnicode_CheckExact(s))
14278 return;
14279 if (PyUnicode_CHECK_INTERNED(s))
14280 return;
14281 if (interned == NULL) {
14282 interned = PyDict_New();
14283 if (interned == NULL) {
14284 PyErr_Clear(); /* Don't leave an exception */
14285 return;
14286 }
14287 }
14288 /* It might be that the GetItem call fails even
14289 though the key is present in the dictionary,
14290 namely when this happens during a stack overflow. */
14291 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014292 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014293 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014294
Benjamin Peterson29060642009-01-31 22:14:21 +000014295 if (t) {
14296 Py_INCREF(t);
14297 Py_DECREF(*p);
14298 *p = t;
14299 return;
14300 }
Walter Dörwald16807132007-05-25 13:52:07 +000014301
Benjamin Peterson14339b62009-01-31 16:36:08 +000014302 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014303 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014304 PyErr_Clear();
14305 PyThreadState_GET()->recursion_critical = 0;
14306 return;
14307 }
14308 PyThreadState_GET()->recursion_critical = 0;
14309 /* The two references in interned are not counted by refcnt.
14310 The deallocator will take care of this */
14311 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014312 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014313}
14314
14315void
14316PyUnicode_InternImmortal(PyObject **p)
14317{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014318 PyUnicode_InternInPlace(p);
14319 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014320 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014321 Py_INCREF(*p);
14322 }
Walter Dörwald16807132007-05-25 13:52:07 +000014323}
14324
14325PyObject *
14326PyUnicode_InternFromString(const char *cp)
14327{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014328 PyObject *s = PyUnicode_FromString(cp);
14329 if (s == NULL)
14330 return NULL;
14331 PyUnicode_InternInPlace(&s);
14332 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014333}
14334
Alexander Belopolsky40018472011-02-26 01:02:56 +000014335void
14336_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014337{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014338 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014339 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014340 Py_ssize_t i, n;
14341 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014342
Benjamin Peterson14339b62009-01-31 16:36:08 +000014343 if (interned == NULL || !PyDict_Check(interned))
14344 return;
14345 keys = PyDict_Keys(interned);
14346 if (keys == NULL || !PyList_Check(keys)) {
14347 PyErr_Clear();
14348 return;
14349 }
Walter Dörwald16807132007-05-25 13:52:07 +000014350
Benjamin Peterson14339b62009-01-31 16:36:08 +000014351 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14352 detector, interned unicode strings are not forcibly deallocated;
14353 rather, we give them their stolen references back, and then clear
14354 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014355
Benjamin Peterson14339b62009-01-31 16:36:08 +000014356 n = PyList_GET_SIZE(keys);
14357 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014358 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014359 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014360 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014361 if (PyUnicode_READY(s) == -1) {
14362 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014363 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014364 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014365 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014366 case SSTATE_NOT_INTERNED:
14367 /* XXX Shouldn't happen */
14368 break;
14369 case SSTATE_INTERNED_IMMORTAL:
14370 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014371 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014372 break;
14373 case SSTATE_INTERNED_MORTAL:
14374 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014375 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014376 break;
14377 default:
14378 Py_FatalError("Inconsistent interned string state.");
14379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014380 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014381 }
14382 fprintf(stderr, "total size of all interned strings: "
14383 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14384 "mortal/immortal\n", mortal_size, immortal_size);
14385 Py_DECREF(keys);
14386 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014387 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014388}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014389
14390
14391/********************* Unicode Iterator **************************/
14392
14393typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014394 PyObject_HEAD
14395 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014396 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014397} unicodeiterobject;
14398
14399static void
14400unicodeiter_dealloc(unicodeiterobject *it)
14401{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014402 _PyObject_GC_UNTRACK(it);
14403 Py_XDECREF(it->it_seq);
14404 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014405}
14406
14407static int
14408unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14409{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014410 Py_VISIT(it->it_seq);
14411 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014412}
14413
14414static PyObject *
14415unicodeiter_next(unicodeiterobject *it)
14416{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014417 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014418
Benjamin Peterson14339b62009-01-31 16:36:08 +000014419 assert(it != NULL);
14420 seq = it->it_seq;
14421 if (seq == NULL)
14422 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014423 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014425 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14426 int kind = PyUnicode_KIND(seq);
14427 void *data = PyUnicode_DATA(seq);
14428 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14429 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014430 if (item != NULL)
14431 ++it->it_index;
14432 return item;
14433 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014434
Benjamin Peterson14339b62009-01-31 16:36:08 +000014435 Py_DECREF(seq);
14436 it->it_seq = NULL;
14437 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014438}
14439
14440static PyObject *
14441unicodeiter_len(unicodeiterobject *it)
14442{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014443 Py_ssize_t len = 0;
14444 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014445 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014446 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014447}
14448
14449PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14450
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014451static PyObject *
14452unicodeiter_reduce(unicodeiterobject *it)
14453{
14454 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014455 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014456 it->it_seq, it->it_index);
14457 } else {
14458 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14459 if (u == NULL)
14460 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014461 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014462 }
14463}
14464
14465PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14466
14467static PyObject *
14468unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14469{
14470 Py_ssize_t index = PyLong_AsSsize_t(state);
14471 if (index == -1 && PyErr_Occurred())
14472 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000014473 if (it->it_seq != NULL) {
14474 if (index < 0)
14475 index = 0;
14476 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
14477 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
14478 it->it_index = index;
14479 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014480 Py_RETURN_NONE;
14481}
14482
14483PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14484
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014485static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014486 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014487 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014488 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14489 reduce_doc},
14490 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14491 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014492 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014493};
14494
14495PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014496 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14497 "str_iterator", /* tp_name */
14498 sizeof(unicodeiterobject), /* tp_basicsize */
14499 0, /* tp_itemsize */
14500 /* methods */
14501 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14502 0, /* tp_print */
14503 0, /* tp_getattr */
14504 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014505 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014506 0, /* tp_repr */
14507 0, /* tp_as_number */
14508 0, /* tp_as_sequence */
14509 0, /* tp_as_mapping */
14510 0, /* tp_hash */
14511 0, /* tp_call */
14512 0, /* tp_str */
14513 PyObject_GenericGetAttr, /* tp_getattro */
14514 0, /* tp_setattro */
14515 0, /* tp_as_buffer */
14516 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14517 0, /* tp_doc */
14518 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14519 0, /* tp_clear */
14520 0, /* tp_richcompare */
14521 0, /* tp_weaklistoffset */
14522 PyObject_SelfIter, /* tp_iter */
14523 (iternextfunc)unicodeiter_next, /* tp_iternext */
14524 unicodeiter_methods, /* tp_methods */
14525 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014526};
14527
14528static PyObject *
14529unicode_iter(PyObject *seq)
14530{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014531 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014532
Benjamin Peterson14339b62009-01-31 16:36:08 +000014533 if (!PyUnicode_Check(seq)) {
14534 PyErr_BadInternalCall();
14535 return NULL;
14536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014537 if (PyUnicode_READY(seq) == -1)
14538 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014539 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14540 if (it == NULL)
14541 return NULL;
14542 it->it_index = 0;
14543 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014544 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014545 _PyObject_GC_TRACK(it);
14546 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014547}
14548
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014549
14550size_t
14551Py_UNICODE_strlen(const Py_UNICODE *u)
14552{
14553 int res = 0;
14554 while(*u++)
14555 res++;
14556 return res;
14557}
14558
14559Py_UNICODE*
14560Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14561{
14562 Py_UNICODE *u = s1;
14563 while ((*u++ = *s2++));
14564 return s1;
14565}
14566
14567Py_UNICODE*
14568Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14569{
14570 Py_UNICODE *u = s1;
14571 while ((*u++ = *s2++))
14572 if (n-- == 0)
14573 break;
14574 return s1;
14575}
14576
14577Py_UNICODE*
14578Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14579{
14580 Py_UNICODE *u1 = s1;
14581 u1 += Py_UNICODE_strlen(u1);
14582 Py_UNICODE_strcpy(u1, s2);
14583 return s1;
14584}
14585
14586int
14587Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14588{
14589 while (*s1 && *s2 && *s1 == *s2)
14590 s1++, s2++;
14591 if (*s1 && *s2)
14592 return (*s1 < *s2) ? -1 : +1;
14593 if (*s1)
14594 return 1;
14595 if (*s2)
14596 return -1;
14597 return 0;
14598}
14599
14600int
14601Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14602{
14603 register Py_UNICODE u1, u2;
14604 for (; n != 0; n--) {
14605 u1 = *s1;
14606 u2 = *s2;
14607 if (u1 != u2)
14608 return (u1 < u2) ? -1 : +1;
14609 if (u1 == '\0')
14610 return 0;
14611 s1++;
14612 s2++;
14613 }
14614 return 0;
14615}
14616
14617Py_UNICODE*
14618Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14619{
14620 const Py_UNICODE *p;
14621 for (p = s; *p; p++)
14622 if (*p == c)
14623 return (Py_UNICODE*)p;
14624 return NULL;
14625}
14626
14627Py_UNICODE*
14628Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14629{
14630 const Py_UNICODE *p;
14631 p = s + Py_UNICODE_strlen(s);
14632 while (p != s) {
14633 p--;
14634 if (*p == c)
14635 return (Py_UNICODE*)p;
14636 }
14637 return NULL;
14638}
Victor Stinner331ea922010-08-10 16:37:20 +000014639
Victor Stinner71133ff2010-09-01 23:43:53 +000014640Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014641PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014642{
Victor Stinner577db2c2011-10-11 22:12:48 +020014643 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014644 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014646 if (!PyUnicode_Check(unicode)) {
14647 PyErr_BadArgument();
14648 return NULL;
14649 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014650 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014651 if (u == NULL)
14652 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014653 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014654 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014655 PyErr_NoMemory();
14656 return NULL;
14657 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014658 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014659 size *= sizeof(Py_UNICODE);
14660 copy = PyMem_Malloc(size);
14661 if (copy == NULL) {
14662 PyErr_NoMemory();
14663 return NULL;
14664 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014665 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014666 return copy;
14667}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014668
Georg Brandl66c221e2010-10-14 07:04:07 +000014669/* A _string module, to export formatter_parser and formatter_field_name_split
14670 to the string.Formatter class implemented in Python. */
14671
14672static PyMethodDef _string_methods[] = {
14673 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14674 METH_O, PyDoc_STR("split the argument as a field name")},
14675 {"formatter_parser", (PyCFunction) formatter_parser,
14676 METH_O, PyDoc_STR("parse the argument as a format string")},
14677 {NULL, NULL}
14678};
14679
14680static struct PyModuleDef _string_module = {
14681 PyModuleDef_HEAD_INIT,
14682 "_string",
14683 PyDoc_STR("string helper module"),
14684 0,
14685 _string_methods,
14686 NULL,
14687 NULL,
14688 NULL,
14689 NULL
14690};
14691
14692PyMODINIT_FUNC
14693PyInit__string(void)
14694{
14695 return PyModule_Create(&_string_module);
14696}
14697
14698
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014699#ifdef __cplusplus
14700}
14701#endif