blob: 9f269a5bc2e9aff0d3afb4fa2100b972414c80ac [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Endianness switches; defaults to little endian.

   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000065
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066
67#ifdef __cplusplus
68extern "C" {
69#endif
70
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff
73
/* Sanity check used inside the assert()s of the macros below.
   With Py_DEBUG it runs _PyUnicode_CheckConsistency() without scanning the
   characters (check_content == 0); otherwise it is a plain PyUnicode_Check()
   type test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020079
/* Field accessors.

   The macros that expand to a bare "->field" expression (_PyUnicode_UTF8,
   _PyUnicode_UTF8_LENGTH, _PyUnicode_WSTR, _PyUnicode_WSTR_LENGTH,
   _PyUnicode_LENGTH, _PyUnicode_STATE, _PyUnicode_HASH, _PyUnicode_DATA_ANY)
   perform no checking and can be assigned to.

   The macros that start with assert() (PyUnicode_UTF8, PyUnicode_UTF8_LENGTH,
   _PyUnicode_KIND, _PyUnicode_GET_LENGTH) are comma expressions and are
   therefore read-only.  They evaluate `op` more than once. */

#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* For a compact ASCII string the UTF-8 form is the character data stored
   immediately after the PyASCIIObject header; otherwise it is the utf8
   field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* For a compact ASCII string the UTF-8 length is the string length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200114
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().

   The result is the bitwise OR of the operands.  It is never smaller than
   either operand, but it is not always equal to one of them
   (0x41 | 0x82 == 0xC3).  It is exact when both operands have the form
   2**k - 1 (0x7f, 0xff, 0xffff).

   NOTE(review): for arbitrary operands above 0xFFFF the OR can exceed
   MAX_UNICODE (0x100000 | 0x0FFFFF == 0x1FFFFF); callers presumably pass
   PyUnicode_MAX_CHAR_VALUE()-style bounds only -- confirm at call sites. */
#define MAX_MAXCHAR(maxchar1, maxchar2)                        \
    ((maxchar1) | (maxchar2))
119
/* File-local replacement for the public PyUnicode_READY() macro.
   Evaluates to 0 when `op` is already ready, otherwise to the result of
   _PyUnicode_Ready(op).  `op` is evaluated more than once. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                 \
     (PyUnicode_IS_READY(op) ?                     \
      0 :                                          \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200126
/* true if the UTF-8 pointer of `op` aliases its character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of `op` aliases its character data.
   The comparison uses the macro argument; it previously named a variable
   called `unicode`, which only worked when the caller's local happened to
   have that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
134
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, the utf8
   pointer is non-NULL, and it does not alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either the
   string is not ready yet or wstr does not alias the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
150
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The copy is done four elements at a time for the largest multiple of four
   that fits, followed by an element-wise loop for the remainder.  Every
   argument is evaluated exactly once, before any local of the macro other
   than the ones initialised from it comes into scope. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200174
Walter Dörwald16807132007-05-25 13:52:07 +0000175/* This dictionary holds all interned unicode strings. Note that references
176 to strings in this dictionary are *not* counted in the string's ob_refcnt.
177 When the interned string reaches a refcnt of 0 the string deallocation
178 function will delete the reference from this dictionary.
179
180 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000181 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000182*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200183static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200187
Serhiy Storchaka678db842013-01-26 12:16:36 +0200188#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200189 do { \
190 if (unicode_empty != NULL) \
191 Py_INCREF(unicode_empty); \
192 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200193 unicode_empty = PyUnicode_New(0, 0); \
194 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200195 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200196 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200199 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000200
Serhiy Storchaka678db842013-01-26 12:16:36 +0200201#define _Py_RETURN_UNICODE_EMPTY() \
202 do { \
203 _Py_INCREF_UNICODE_EMPTY(); \
204 return unicode_empty; \
205 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200207/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200208static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* Single character Unicode strings in the Latin-1 range are being
211 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200212static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry is 1 for whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
244
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200245/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200246static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200247static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100248static int unicode_modifiable(PyObject *unicode);
249
Victor Stinnerfe226c02011-10-03 03:52:20 +0200250
Alexander Belopolsky40018472011-02-26 01:02:56 +0000251static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100252_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200253static PyObject *
254_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
255static PyObject *
256_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
257
258static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000259unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000260 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100261 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000262 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
263
Alexander Belopolsky40018472011-02-26 01:02:56 +0000264static void
265raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300266 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100267 PyObject *unicode,
268 Py_ssize_t startpos, Py_ssize_t endpos,
269 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000270
/* Same for linebreaks.
   Indexed by code point 0..127; an entry is 1 for a line break character,
   0 otherwise.  The table is only ever read, so it is declared const like
   _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
298
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300299/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
300 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000301Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000302PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000303{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000304#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000305 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000307 /* This is actually an illegal character, so it should
308 not be passed to unichr. */
309 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000310#endif
311}
312
#ifdef Py_DEBUG
/* Assert that the internal fields of the Unicode object `op` agree with each
   other.  Only compiled when Py_DEBUG is defined.

   op            -- object to check; must satisfy PyUnicode_Check().
   check_content -- when non-zero (and the string is not in the legacy
                    wchar_t form) the characters are also scanned to verify
                    that the narrowest possible kind is used and that the
                    buffer is followed by a null character.

   Always returns 1 so that the call can itself be wrapped in assert();
   violations are reported by the assert()s inside. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that has not been made ready yet */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing cache must have a zero cached length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character buffer is null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429static PyObject*
430unicode_result_wchar(PyObject *unicode)
431{
432#ifndef Py_DEBUG
433 Py_ssize_t len;
434
435 assert(Py_REFCNT(unicode) == 1);
436
437 len = _PyUnicode_WSTR_LENGTH(unicode);
438 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100439 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200440 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 }
442
443 if (len == 1) {
444 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100445 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
447 Py_DECREF(unicode);
448 return latin1_char;
449 }
450 }
451
452 if (_PyUnicode_Ready(unicode) < 0) {
453 Py_XDECREF(unicode);
454 return NULL;
455 }
456#else
457 /* don't make the result ready in debug mode to ensure that the caller
458 makes the string ready before using it */
459 assert(_PyUnicode_CheckConsistency(unicode, 1));
460#endif
461 return unicode;
462}
463
464static PyObject*
465unicode_result_ready(PyObject *unicode)
466{
467 Py_ssize_t length;
468
469 length = PyUnicode_GET_LENGTH(unicode);
470 if (length == 0) {
471 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100472 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200473 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100474 }
475 return unicode_empty;
476 }
477
478 if (length == 1) {
479 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
480 if (ch < 256) {
481 PyObject *latin1_char = unicode_latin1[ch];
482 if (latin1_char != NULL) {
483 if (unicode != latin1_char) {
484 Py_INCREF(latin1_char);
485 Py_DECREF(unicode);
486 }
487 return latin1_char;
488 }
489 else {
490 assert(_PyUnicode_CheckConsistency(unicode, 1));
491 Py_INCREF(unicode);
492 unicode_latin1[ch] = unicode;
493 return unicode;
494 }
495 }
496 }
497
498 assert(_PyUnicode_CheckConsistency(unicode, 1));
499 return unicode;
500}
501
502static PyObject*
503unicode_result(PyObject *unicode)
504{
505 assert(_PyUnicode_CHECK(unicode));
506 if (PyUnicode_IS_READY(unicode))
507 return unicode_result_ready(unicode);
508 else
509 return unicode_result_wchar(unicode);
510}
511
Victor Stinnerc4b49542011-12-11 22:44:26 +0100512static PyObject*
513unicode_result_unchanged(PyObject *unicode)
514{
515 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500516 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100517 return NULL;
518 Py_INCREF(unicode);
519 return unicode;
520 }
521 else
522 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100523 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100524}
525
#ifdef HAVE_MBCS
/* Windows version information; only present when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
529
Thomas Wouters477c8d52006-05-27 19:21:47 +0000530/* --- Bloom Filters ----------------------------------------------------- */
531
532/* stuff to implement simple "bloom filters" for Unicode characters.
533 to keep things simple, we use a single bitmask, using the least 5
534 bits from each unicode characters as the bit index. */
535
536/* the linebreak mask is set up by Unicode_Init below */
537
Antoine Pitrouf068f942010-01-13 14:19:12 +0000538#if LONG_BIT >= 128
539#define BLOOM_WIDTH 128
540#elif LONG_BIT >= 64
541#define BLOOM_WIDTH 64
542#elif LONG_BIT >= 32
543#define BLOOM_WIDTH 32
544#else
545#error "LONG_BIT is smaller than 32"
546#endif
547
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548#define BLOOM_MASK unsigned long
549
Serhiy Storchaka05997252013-01-26 12:14:02 +0200550static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Antoine Pitrouf068f942010-01-13 14:19:12 +0000552#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
553#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554
Benjamin Peterson29060642009-01-31 22:14:21 +0000555#define BLOOM_LINEBREAK(ch) \
556 ((ch) < 128U ? ascii_linebreak[(ch)] : \
557 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558
Alexander Belopolsky40018472011-02-26 01:02:56 +0000559Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000561{
562 /* calculate simple bloom-style bitmask for a given unicode string */
563
Antoine Pitrouf068f942010-01-13 14:19:12 +0000564 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000565 Py_ssize_t i;
566
567 mask = 0;
568 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200569 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 return mask;
572}
573
/* Non-zero if chr occurs in str.  The bloom mask is tested first so that
   PyUnicode_FindChar() only runs when the mask does not rule chr out.
   chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM((mask), (chr)) \
     && (PyUnicode_FindChar((str), (chr), 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000577
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200578/* Compilation of templated routines */
579
580#include "stringlib/asciilib.h"
581#include "stringlib/fastsearch.h"
582#include "stringlib/partition.h"
583#include "stringlib/split.h"
584#include "stringlib/count.h"
585#include "stringlib/find.h"
586#include "stringlib/find_max_char.h"
587#include "stringlib/localeutil.h"
588#include "stringlib/undef.h"
589
590#include "stringlib/ucs1lib.h"
591#include "stringlib/fastsearch.h"
592#include "stringlib/partition.h"
593#include "stringlib/split.h"
594#include "stringlib/count.h"
595#include "stringlib/find.h"
596#include "stringlib/find_max_char.h"
597#include "stringlib/localeutil.h"
598#include "stringlib/undef.h"
599
600#include "stringlib/ucs2lib.h"
601#include "stringlib/fastsearch.h"
602#include "stringlib/partition.h"
603#include "stringlib/split.h"
604#include "stringlib/count.h"
605#include "stringlib/find.h"
606#include "stringlib/find_max_char.h"
607#include "stringlib/localeutil.h"
608#include "stringlib/undef.h"
609
610#include "stringlib/ucs4lib.h"
611#include "stringlib/fastsearch.h"
612#include "stringlib/partition.h"
613#include "stringlib/split.h"
614#include "stringlib/count.h"
615#include "stringlib/find.h"
616#include "stringlib/find_max_char.h"
617#include "stringlib/localeutil.h"
618#include "stringlib/undef.h"
619
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200620#include "stringlib/unicodedefs.h"
621#include "stringlib/fastsearch.h"
622#include "stringlib/count.h"
623#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100624#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200625
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626/* --- Unicode Object ----------------------------------------------------- */
627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200628static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200629fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200630
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200631Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
632 Py_ssize_t size, Py_UCS4 ch,
633 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200635 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
636
637 switch (kind) {
638 case PyUnicode_1BYTE_KIND:
639 {
640 Py_UCS1 ch1 = (Py_UCS1) ch;
641 if (ch1 == ch)
642 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
643 else
644 return -1;
645 }
646 case PyUnicode_2BYTE_KIND:
647 {
648 Py_UCS2 ch2 = (Py_UCS2) ch;
649 if (ch2 == ch)
650 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
651 else
652 return -1;
653 }
654 case PyUnicode_4BYTE_KIND:
655 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
656 default:
657 assert(0);
658 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200660}
661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662static PyObject*
663resize_compact(PyObject *unicode, Py_ssize_t length)
664{
665 Py_ssize_t char_size;
666 Py_ssize_t struct_size;
667 Py_ssize_t new_size;
668 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100669 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200670 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200671 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100672 assert(PyUnicode_IS_COMPACT(unicode));
673
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200674 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100675 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 struct_size = sizeof(PyASCIIObject);
677 else
678 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200679 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
682 PyErr_NoMemory();
683 return NULL;
684 }
685 new_size = (struct_size + (length + 1) * char_size);
686
Victor Stinner84def372011-12-11 20:04:56 +0100687 _Py_DEC_REFTOTAL;
688 _Py_ForgetReference(unicode);
689
690 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
691 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100692 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 PyErr_NoMemory();
694 return NULL;
695 }
Victor Stinner84def372011-12-11 20:04:56 +0100696 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100698
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200700 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200701 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100702 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200703 _PyUnicode_WSTR_LENGTH(unicode) = length;
704 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100705 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
706 PyObject_DEL(_PyUnicode_WSTR(unicode));
707 _PyUnicode_WSTR(unicode) = NULL;
708 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
710 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200711 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200712 return unicode;
713}
714
Alexander Belopolsky40018472011-02-26 01:02:56 +0000715static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200716resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717{
Victor Stinner95663112011-10-04 01:03:50 +0200718 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100719 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200720 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000722
Victor Stinnerfe226c02011-10-03 03:52:20 +0200723 if (PyUnicode_IS_READY(unicode)) {
724 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200725 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 void *data;
727
728 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200729 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
731 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732
733 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
734 PyErr_NoMemory();
735 return -1;
736 }
737 new_size = (length + 1) * char_size;
738
Victor Stinner7a9105a2011-12-12 00:13:42 +0100739 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
740 {
741 PyObject_DEL(_PyUnicode_UTF8(unicode));
742 _PyUnicode_UTF8(unicode) = NULL;
743 _PyUnicode_UTF8_LENGTH(unicode) = 0;
744 }
745
Victor Stinnerfe226c02011-10-03 03:52:20 +0200746 data = (PyObject *)PyObject_REALLOC(data, new_size);
747 if (data == NULL) {
748 PyErr_NoMemory();
749 return -1;
750 }
751 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200752 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200753 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200754 _PyUnicode_WSTR_LENGTH(unicode) = length;
755 }
756 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200757 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200758 _PyUnicode_UTF8_LENGTH(unicode) = length;
759 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 _PyUnicode_LENGTH(unicode) = length;
761 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200762 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200763 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 }
Victor Stinner95663112011-10-04 01:03:50 +0200767 assert(_PyUnicode_WSTR(unicode) != NULL);
768
769 /* check for integer overflow */
770 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
771 PyErr_NoMemory();
772 return -1;
773 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100774 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200775 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100776 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200777 if (!wstr) {
778 PyErr_NoMemory();
779 return -1;
780 }
781 _PyUnicode_WSTR(unicode) = wstr;
782 _PyUnicode_WSTR(unicode)[length] = 0;
783 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200784 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000785 return 0;
786}
787
Victor Stinnerfe226c02011-10-03 03:52:20 +0200788static PyObject*
789resize_copy(PyObject *unicode, Py_ssize_t length)
790{
791 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100792 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200793 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100794
Benjamin Petersonbac79492012-01-14 13:34:47 -0500795 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100796 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797
798 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
799 if (copy == NULL)
800 return NULL;
801
802 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200803 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200804 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200805 }
806 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200807 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100808
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200809 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200810 if (w == NULL)
811 return NULL;
812 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
813 copy_length = Py_MIN(copy_length, length);
814 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
815 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200816 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200817 }
818}
819
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
828
Alexander Belopolsky40018472011-02-26 01:02:56 +0000829static PyUnicodeObject *
830_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000831{
832 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834
Thomas Wouters477c8d52006-05-27 19:21:47 +0000835 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 if (length == 0 && unicode_empty != NULL) {
837 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200838 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000839 }
840
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000841 /* Ensure we won't overflow the size. */
842 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
843 return (PyUnicodeObject *)PyErr_NoMemory();
844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200845 if (length < 0) {
846 PyErr_SetString(PyExc_SystemError,
847 "Negative size passed to _PyUnicode_New");
848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000849 }
850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200851 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
852 if (unicode == NULL)
853 return NULL;
854 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
855 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
856 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100857 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000858 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100859 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861
Jeremy Hyltond8082792003-09-16 19:41:39 +0000862 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000863 * the caller fails before initializing str -- unicode_resize()
864 * reads str[0], and the Keep-Alive optimization can keep memory
865 * allocated for str alive across a call to unicode_dealloc(unicode).
866 * We don't want unicode_resize to read uninitialized memory in
867 * that case.
868 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200869 _PyUnicode_WSTR(unicode)[0] = 0;
870 _PyUnicode_WSTR(unicode)[length] = 0;
871 _PyUnicode_WSTR_LENGTH(unicode) = length;
872 _PyUnicode_HASH(unicode) = -1;
873 _PyUnicode_STATE(unicode).interned = 0;
874 _PyUnicode_STATE(unicode).kind = 0;
875 _PyUnicode_STATE(unicode).compact = 0;
876 _PyUnicode_STATE(unicode).ready = 0;
877 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200878 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200879 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200880 _PyUnicode_UTF8(unicode) = NULL;
881 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100882 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883 return unicode;
884}
885
Victor Stinnerf42dc442011-10-02 23:33:16 +0200886static const char*
887unicode_kind_name(PyObject *unicode)
888{
Victor Stinner42dfd712011-10-03 14:41:45 +0200889 /* don't check consistency: unicode_kind_name() is called from
890 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200891 if (!PyUnicode_IS_COMPACT(unicode))
892 {
893 if (!PyUnicode_IS_READY(unicode))
894 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600895 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 {
897 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 return "legacy ascii";
900 else
901 return "legacy latin1";
902 case PyUnicode_2BYTE_KIND:
903 return "legacy UCS2";
904 case PyUnicode_4BYTE_KIND:
905 return "legacy UCS4";
906 default:
907 return "<legacy invalid kind>";
908 }
909 }
910 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600911 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200912 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200913 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200914 return "ascii";
915 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200916 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200917 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200918 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200919 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200920 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200921 default:
922 return "<invalid compact kind>";
923 }
924}
925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200926#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
931
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
935void *_PyUnicode_data(void *unicode){
936 printf("obj %p\n", unicode);
937 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
938 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
939 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
940 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
941 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
942 return PyUnicode_DATA(unicode);
943}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200944
945void
946_PyUnicode_Dump(PyObject *op)
947{
948 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200949 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
950 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
951 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200952
Victor Stinnera849a4b2011-10-03 12:12:11 +0200953 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200954 {
955 if (ascii->state.ascii)
956 data = (ascii + 1);
957 else
958 data = (compact + 1);
959 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200960 else
961 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200962 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
963
Victor Stinnera849a4b2011-10-03 12:12:11 +0200964 if (ascii->wstr == data)
965 printf("shared ");
966 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200967
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200969 printf(" (%zu), ", compact->wstr_length);
970 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
971 printf("shared ");
972 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200973 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200974 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200975}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976#endif
977
978PyObject *
979PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
980{
981 PyObject *obj;
982 PyCompactUnicodeObject *unicode;
983 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200984 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200985 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200986 Py_ssize_t char_size;
987 Py_ssize_t struct_size;
988
989 /* Optimization for empty strings */
990 if (size == 0 && unicode_empty != NULL) {
991 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200992 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200993 }
994
Victor Stinner9e9d6892011-10-04 01:02:02 +0200995 is_ascii = 0;
996 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200997 struct_size = sizeof(PyCompactUnicodeObject);
998 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200999 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001000 char_size = 1;
1001 is_ascii = 1;
1002 struct_size = sizeof(PyASCIIObject);
1003 }
1004 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001005 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 char_size = 1;
1007 }
1008 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001009 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001010 char_size = 2;
1011 if (sizeof(wchar_t) == 2)
1012 is_sharing = 1;
1013 }
1014 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001015 if (maxchar > MAX_UNICODE) {
1016 PyErr_SetString(PyExc_SystemError,
1017 "invalid maximum character passed to PyUnicode_New");
1018 return NULL;
1019 }
Victor Stinner8f825062012-04-27 13:55:39 +02001020 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001021 char_size = 4;
1022 if (sizeof(wchar_t) == 4)
1023 is_sharing = 1;
1024 }
1025
1026 /* Ensure we won't overflow the size. */
1027 if (size < 0) {
1028 PyErr_SetString(PyExc_SystemError,
1029 "Negative size passed to PyUnicode_New");
1030 return NULL;
1031 }
1032 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1033 return PyErr_NoMemory();
1034
1035 /* Duplicated allocation code from _PyObject_New() instead of a call to
1036 * PyObject_New() so we are able to allocate space for the object and
1037 * it's data buffer.
1038 */
1039 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1040 if (obj == NULL)
1041 return PyErr_NoMemory();
1042 obj = PyObject_INIT(obj, &PyUnicode_Type);
1043 if (obj == NULL)
1044 return NULL;
1045
1046 unicode = (PyCompactUnicodeObject *)obj;
1047 if (is_ascii)
1048 data = ((PyASCIIObject*)obj) + 1;
1049 else
1050 data = unicode + 1;
1051 _PyUnicode_LENGTH(unicode) = size;
1052 _PyUnicode_HASH(unicode) = -1;
1053 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001054 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055 _PyUnicode_STATE(unicode).compact = 1;
1056 _PyUnicode_STATE(unicode).ready = 1;
1057 _PyUnicode_STATE(unicode).ascii = is_ascii;
1058 if (is_ascii) {
1059 ((char*)data)[size] = 0;
1060 _PyUnicode_WSTR(unicode) = NULL;
1061 }
Victor Stinner8f825062012-04-27 13:55:39 +02001062 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 ((char*)data)[size] = 0;
1064 _PyUnicode_WSTR(unicode) = NULL;
1065 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001067 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 else {
1070 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001071 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001072 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001074 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 ((Py_UCS4*)data)[size] = 0;
1076 if (is_sharing) {
1077 _PyUnicode_WSTR_LENGTH(unicode) = size;
1078 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1079 }
1080 else {
1081 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1082 _PyUnicode_WSTR(unicode) = NULL;
1083 }
1084 }
Victor Stinner8f825062012-04-27 13:55:39 +02001085#ifdef Py_DEBUG
1086 /* Fill the data with invalid characters to detect bugs earlier.
1087 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1088 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1089 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1090 memset(data, 0xff, size * kind);
1091#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001092 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093 return obj;
1094}
1095
1096#if SIZEOF_WCHAR_T == 2
1097/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1098 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001099 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001100
1101 This function assumes that unicode can hold one more code point than wstr
1102 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001103static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001105 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106{
1107 const wchar_t *iter;
1108 Py_UCS4 *ucs4_out;
1109
Victor Stinner910337b2011-10-03 03:20:16 +02001110 assert(unicode != NULL);
1111 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001112 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1113 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1114
1115 for (iter = begin; iter < end; ) {
1116 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1117 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001118 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1119 && (iter+1) < end
1120 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 {
Victor Stinner551ac952011-11-29 22:58:13 +01001122 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 iter += 2;
1124 }
1125 else {
1126 *ucs4_out++ = *iter;
1127 iter++;
1128 }
1129 }
1130 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1131 _PyUnicode_GET_LENGTH(unicode)));
1132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133}
1134#endif
1135
Victor Stinnercd9950f2011-10-02 00:34:53 +02001136static int
Victor Stinner488fa492011-12-12 00:01:39 +01001137unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001138{
Victor Stinner488fa492011-12-12 00:01:39 +01001139 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001140 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001141 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001142 return -1;
1143 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001144 return 0;
1145}
1146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147static int
1148_copy_characters(PyObject *to, Py_ssize_t to_start,
1149 PyObject *from, Py_ssize_t from_start,
1150 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001152 unsigned int from_kind, to_kind;
1153 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154
Victor Stinneree4544c2012-05-09 22:24:08 +02001155 assert(0 <= how_many);
1156 assert(0 <= from_start);
1157 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001158 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001159 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001160 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161
Victor Stinnerd3f08822012-05-29 12:57:52 +02001162 assert(PyUnicode_Check(to));
1163 assert(PyUnicode_IS_READY(to));
1164 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1165
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001166 if (how_many == 0)
1167 return 0;
1168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001170 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001172 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173
Victor Stinnerf1852262012-06-16 16:38:26 +02001174#ifdef Py_DEBUG
1175 if (!check_maxchar
1176 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1177 {
1178 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1179 Py_UCS4 ch;
1180 Py_ssize_t i;
1181 for (i=0; i < how_many; i++) {
1182 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1183 assert(ch <= to_maxchar);
1184 }
1185 }
1186#endif
1187
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001188 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001189 if (check_maxchar
1190 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1191 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001192 /* Writing Latin-1 characters into an ASCII string requires to
1193 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001194 Py_UCS4 max_char;
1195 max_char = ucs1lib_find_max_char(from_data,
1196 (Py_UCS1*)from_data + how_many);
1197 if (max_char >= 128)
1198 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001199 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001200 Py_MEMCPY((char*)to_data + to_kind * to_start,
1201 (char*)from_data + from_kind * from_start,
1202 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else if (from_kind == PyUnicode_1BYTE_KIND
1205 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001206 {
1207 _PyUnicode_CONVERT_BYTES(
1208 Py_UCS1, Py_UCS2,
1209 PyUnicode_1BYTE_DATA(from) + from_start,
1210 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1211 PyUnicode_2BYTE_DATA(to) + to_start
1212 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001213 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001214 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001215 && to_kind == PyUnicode_4BYTE_KIND)
1216 {
1217 _PyUnicode_CONVERT_BYTES(
1218 Py_UCS1, Py_UCS4,
1219 PyUnicode_1BYTE_DATA(from) + from_start,
1220 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1221 PyUnicode_4BYTE_DATA(to) + to_start
1222 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001223 }
1224 else if (from_kind == PyUnicode_2BYTE_KIND
1225 && to_kind == PyUnicode_4BYTE_KIND)
1226 {
1227 _PyUnicode_CONVERT_BYTES(
1228 Py_UCS2, Py_UCS4,
1229 PyUnicode_2BYTE_DATA(from) + from_start,
1230 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1231 PyUnicode_4BYTE_DATA(to) + to_start
1232 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001233 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001234 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001235 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1236
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001237 if (!check_maxchar) {
1238 if (from_kind == PyUnicode_2BYTE_KIND
1239 && to_kind == PyUnicode_1BYTE_KIND)
1240 {
1241 _PyUnicode_CONVERT_BYTES(
1242 Py_UCS2, Py_UCS1,
1243 PyUnicode_2BYTE_DATA(from) + from_start,
1244 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1245 PyUnicode_1BYTE_DATA(to) + to_start
1246 );
1247 }
1248 else if (from_kind == PyUnicode_4BYTE_KIND
1249 && to_kind == PyUnicode_1BYTE_KIND)
1250 {
1251 _PyUnicode_CONVERT_BYTES(
1252 Py_UCS4, Py_UCS1,
1253 PyUnicode_4BYTE_DATA(from) + from_start,
1254 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1255 PyUnicode_1BYTE_DATA(to) + to_start
1256 );
1257 }
1258 else if (from_kind == PyUnicode_4BYTE_KIND
1259 && to_kind == PyUnicode_2BYTE_KIND)
1260 {
1261 _PyUnicode_CONVERT_BYTES(
1262 Py_UCS4, Py_UCS2,
1263 PyUnicode_4BYTE_DATA(from) + from_start,
1264 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1265 PyUnicode_2BYTE_DATA(to) + to_start
1266 );
1267 }
1268 else {
1269 assert(0);
1270 return -1;
1271 }
1272 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001273 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001274 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001275 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001276 Py_ssize_t i;
1277
Victor Stinnera0702ab2011-09-29 14:14:38 +02001278 for (i=0; i < how_many; i++) {
1279 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001280 if (ch > to_maxchar)
1281 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1283 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001284 }
1285 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001286 return 0;
1287}
1288
Victor Stinnerd3f08822012-05-29 12:57:52 +02001289void
1290_PyUnicode_FastCopyCharacters(
1291 PyObject *to, Py_ssize_t to_start,
1292 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001293{
1294 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1295}
1296
1297Py_ssize_t
1298PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1299 PyObject *from, Py_ssize_t from_start,
1300 Py_ssize_t how_many)
1301{
1302 int err;
1303
1304 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1305 PyErr_BadInternalCall();
1306 return -1;
1307 }
1308
Benjamin Petersonbac79492012-01-14 13:34:47 -05001309 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001310 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001311 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001312 return -1;
1313
Victor Stinnerd3f08822012-05-29 12:57:52 +02001314 if (from_start < 0) {
1315 PyErr_SetString(PyExc_IndexError, "string index out of range");
1316 return -1;
1317 }
1318 if (to_start < 0) {
1319 PyErr_SetString(PyExc_IndexError, "string index out of range");
1320 return -1;
1321 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001322 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1323 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1324 PyErr_Format(PyExc_SystemError,
1325 "Cannot write %zi characters at %zi "
1326 "in a string of %zi characters",
1327 how_many, to_start, PyUnicode_GET_LENGTH(to));
1328 return -1;
1329 }
1330
1331 if (how_many == 0)
1332 return 0;
1333
Victor Stinner488fa492011-12-12 00:01:39 +01001334 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001335 return -1;
1336
1337 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1338 if (err) {
1339 PyErr_Format(PyExc_SystemError,
1340 "Cannot copy %s characters "
1341 "into a string of %s characters",
1342 unicode_kind_name(from),
1343 unicode_kind_name(to));
1344 return -1;
1345 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001346 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347}
1348
Victor Stinner17222162011-09-28 22:15:37 +02001349/* Find the maximum code point and count the number of surrogate pairs so a
1350 correct string length can be computed before converting a string to UCS4.
1351 This function counts single surrogates as a character and not as a pair.
1352
1353 Return 0 on success, or -1 on error. */
1354static int
1355find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1356 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357{
1358 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001359 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001360
Victor Stinnerc53be962011-10-02 21:33:54 +02001361 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362 *num_surrogates = 0;
1363 *maxchar = 0;
1364
1365 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001367 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1368 && (iter+1) < end
1369 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001371 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373 iter += 2;
1374 }
1375 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001377 {
1378 ch = *iter;
1379 iter++;
1380 }
1381 if (ch > *maxchar) {
1382 *maxchar = ch;
1383 if (*maxchar > MAX_UNICODE) {
1384 PyErr_Format(PyExc_ValueError,
1385 "character U+%x is not in range [U+0000; U+10ffff]",
1386 ch);
1387 return -1;
1388 }
1389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 }
1391 return 0;
1392}
1393
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001394int
1395_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396{
1397 wchar_t *end;
1398 Py_UCS4 maxchar = 0;
1399 Py_ssize_t num_surrogates;
1400#if SIZEOF_WCHAR_T == 2
1401 Py_ssize_t length_wo_surrogates;
1402#endif
1403
Georg Brandl7597add2011-10-05 16:36:47 +02001404 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001405 strings were created using _PyObject_New() and where no canonical
1406 representation (the str field) has been set yet aka strings
1407 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001408 assert(_PyUnicode_CHECK(unicode));
1409 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001411 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001412 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001413 /* Actually, it should neither be interned nor be anything else: */
1414 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001417 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001418 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420
1421 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001422 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1423 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 PyErr_NoMemory();
1425 return -1;
1426 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001427 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001428 _PyUnicode_WSTR(unicode), end,
1429 PyUnicode_1BYTE_DATA(unicode));
1430 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1431 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1432 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1433 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001434 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001435 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001436 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 }
1438 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001439 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001440 _PyUnicode_UTF8(unicode) = NULL;
1441 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 }
1443 PyObject_FREE(_PyUnicode_WSTR(unicode));
1444 _PyUnicode_WSTR(unicode) = NULL;
1445 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1446 }
1447 /* In this case we might have to convert down from 4-byte native
1448 wchar_t to 2-byte unicode. */
1449 else if (maxchar < 65536) {
1450 assert(num_surrogates == 0 &&
1451 "FindMaxCharAndNumSurrogatePairs() messed up");
1452
Victor Stinner506f5922011-09-28 22:34:18 +02001453#if SIZEOF_WCHAR_T == 2
1454 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001455 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001456 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1457 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1458 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001459 _PyUnicode_UTF8(unicode) = NULL;
1460 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001461#else
1462 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001463 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001464 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001465 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001466 PyErr_NoMemory();
1467 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 }
Victor Stinner506f5922011-09-28 22:34:18 +02001469 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1470 _PyUnicode_WSTR(unicode), end,
1471 PyUnicode_2BYTE_DATA(unicode));
1472 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1473 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1474 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001475 _PyUnicode_UTF8(unicode) = NULL;
1476 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001477 PyObject_FREE(_PyUnicode_WSTR(unicode));
1478 _PyUnicode_WSTR(unicode) = NULL;
1479 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1480#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 }
1482 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1483 else {
1484#if SIZEOF_WCHAR_T == 2
1485 /* in case the native representation is 2-bytes, we need to allocate a
1486 new normalized 4-byte version. */
1487 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001488 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1489 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 PyErr_NoMemory();
1491 return -1;
1492 }
1493 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1494 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001495 _PyUnicode_UTF8(unicode) = NULL;
1496 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001497 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1498 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001499 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001500 PyObject_FREE(_PyUnicode_WSTR(unicode));
1501 _PyUnicode_WSTR(unicode) = NULL;
1502 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1503#else
1504 assert(num_surrogates == 0);
1505
Victor Stinnerc3c74152011-10-02 20:39:55 +02001506 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001508 _PyUnicode_UTF8(unicode) = NULL;
1509 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001510 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1511#endif
1512 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1513 }
1514 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001515 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 return 0;
1517}
1518
Alexander Belopolsky40018472011-02-26 01:02:56 +00001519static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001520unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001521{
Walter Dörwald16807132007-05-25 13:52:07 +00001522 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001523 case SSTATE_NOT_INTERNED:
1524 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001525
Benjamin Peterson29060642009-01-31 22:14:21 +00001526 case SSTATE_INTERNED_MORTAL:
1527 /* revive dead object temporarily for DelItem */
1528 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001529 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001530 Py_FatalError(
1531 "deletion of interned string failed");
1532 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001533
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 case SSTATE_INTERNED_IMMORTAL:
1535 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001536
Benjamin Peterson29060642009-01-31 22:14:21 +00001537 default:
1538 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001539 }
1540
Victor Stinner03490912011-10-03 23:45:12 +02001541 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001543 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001544 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001545 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1546 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001548 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549}
1550
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is one of the shared singletons (the
   empty string, or a cached one-character Latin-1 string), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* only a one-character string that is not in the legacy wchar_t state
       can be a cached Latin-1 character */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return first < 256 && unicode_latin1[first] == unicode;
}
#endif
1567
Alexander Belopolsky40018472011-02-26 01:02:56 +00001568static int
Victor Stinner488fa492011-12-12 00:01:39 +01001569unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001570{
Victor Stinner488fa492011-12-12 00:01:39 +01001571 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 if (Py_REFCNT(unicode) != 1)
1573 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001574 if (_PyUnicode_HASH(unicode) != -1)
1575 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001576 if (PyUnicode_CHECK_INTERNED(unicode))
1577 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001578 if (!PyUnicode_CheckExact(unicode))
1579 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001580#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001581 /* singleton refcount is greater than 1 */
1582 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001583#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 return 1;
1585}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001586
Victor Stinnerfe226c02011-10-03 03:52:20 +02001587static int
1588unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1589{
1590 PyObject *unicode;
1591 Py_ssize_t old_length;
1592
1593 assert(p_unicode != NULL);
1594 unicode = *p_unicode;
1595
1596 assert(unicode != NULL);
1597 assert(PyUnicode_Check(unicode));
1598 assert(0 <= length);
1599
Victor Stinner910337b2011-10-03 03:20:16 +02001600 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 old_length = PyUnicode_WSTR_LENGTH(unicode);
1602 else
1603 old_length = PyUnicode_GET_LENGTH(unicode);
1604 if (old_length == length)
1605 return 0;
1606
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001607 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001608 _Py_INCREF_UNICODE_EMPTY();
1609 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001610 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001611 Py_DECREF(*p_unicode);
1612 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001613 return 0;
1614 }
1615
Victor Stinner488fa492011-12-12 00:01:39 +01001616 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001617 PyObject *copy = resize_copy(unicode, length);
1618 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001619 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 Py_DECREF(*p_unicode);
1621 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001622 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001623 }
1624
Victor Stinnerfe226c02011-10-03 03:52:20 +02001625 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001626 PyObject *new_unicode = resize_compact(unicode, length);
1627 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001628 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001629 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001630 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001631 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001632 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001633}
1634
Alexander Belopolsky40018472011-02-26 01:02:56 +00001635int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001636PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001637{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001638 PyObject *unicode;
1639 if (p_unicode == NULL) {
1640 PyErr_BadInternalCall();
1641 return -1;
1642 }
1643 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001644 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001645 {
1646 PyErr_BadInternalCall();
1647 return -1;
1648 }
1649 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001650}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001651
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001652static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001653unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1654 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655{
1656 PyObject *result;
1657 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001658 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1660 return 0;
1661 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1662 maxchar);
1663 if (result == NULL)
1664 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001665 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001666 Py_DECREF(*p_unicode);
1667 *p_unicode = result;
1668 return 0;
1669}
1670
1671static int
1672unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1673 Py_UCS4 ch)
1674{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001675 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001676 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001677 return -1;
1678 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1679 PyUnicode_DATA(*p_unicode),
1680 (*pos)++, ch);
1681 return 0;
1682}
1683
Victor Stinnerc5166102012-02-22 13:55:02 +01001684/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001685
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001686 WARNING: The function doesn't copy the terminating null character and
1687 doesn't check the maximum character (may write a latin1 character in an
1688 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001689static void
1690unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1691 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001692{
1693 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1694 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001695 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001696
1697 switch (kind) {
1698 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001699 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001700 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001701 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001702 }
1703 case PyUnicode_2BYTE_KIND: {
1704 Py_UCS2 *start = (Py_UCS2 *)data + index;
1705 Py_UCS2 *ucs2 = start;
1706 assert(index <= PyUnicode_GET_LENGTH(unicode));
1707
Victor Stinner184252a2012-06-16 02:57:41 +02001708 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001709 *ucs2 = (Py_UCS2)*str;
1710
1711 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001712 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001713 }
1714 default: {
1715 Py_UCS4 *start = (Py_UCS4 *)data + index;
1716 Py_UCS4 *ucs4 = start;
1717 assert(kind == PyUnicode_4BYTE_KIND);
1718 assert(index <= PyUnicode_GET_LENGTH(unicode));
1719
Victor Stinner184252a2012-06-16 02:57:41 +02001720 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001721 *ucs4 = (Py_UCS4)*str;
1722
1723 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001724 }
1725 }
1726}
1727
1728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729static PyObject*
1730get_latin1_char(unsigned char ch)
1731{
Victor Stinnera464fc12011-10-02 20:39:30 +02001732 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001734 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 if (!unicode)
1736 return NULL;
1737 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001738 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739 unicode_latin1[ch] = unicode;
1740 }
1741 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001742 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743}
1744
Alexander Belopolsky40018472011-02-26 01:02:56 +00001745PyObject *
1746PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001748 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 Py_UCS4 maxchar = 0;
1750 Py_ssize_t num_surrogates;
1751
1752 if (u == NULL)
1753 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001755 /* If the Unicode data is known at construction time, we can apply
1756 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001759 if (size == 0)
1760 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 /* Single character Unicode objects in the Latin-1 range are
1763 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001764 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 return get_latin1_char((unsigned char)*u);
1766
1767 /* If not empty and not single character, copy the Unicode data
1768 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001769 if (find_maxchar_surrogates(u, u + size,
1770 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 return NULL;
1772
Victor Stinner8faf8212011-12-08 22:14:11 +01001773 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 if (!unicode)
1775 return NULL;
1776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 switch (PyUnicode_KIND(unicode)) {
1778 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001779 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001780 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1781 break;
1782 case PyUnicode_2BYTE_KIND:
1783#if Py_UNICODE_SIZE == 2
1784 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1785#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001786 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1788#endif
1789 break;
1790 case PyUnicode_4BYTE_KIND:
1791#if SIZEOF_WCHAR_T == 2
1792 /* This is the only case which has to process surrogates, thus
1793 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001794 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795#else
1796 assert(num_surrogates == 0);
1797 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1798#endif
1799 break;
1800 default:
1801 assert(0 && "Impossible state");
1802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001804 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805}
1806
Alexander Belopolsky40018472011-02-26 01:02:56 +00001807PyObject *
1808PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001809{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001810 if (size < 0) {
1811 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001812 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001813 return NULL;
1814 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001815 if (u != NULL)
1816 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1817 else
1818 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001819}
1820
Alexander Belopolsky40018472011-02-26 01:02:56 +00001821PyObject *
1822PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001823{
1824 size_t size = strlen(u);
1825 if (size > PY_SSIZE_T_MAX) {
1826 PyErr_SetString(PyExc_OverflowError, "input too long");
1827 return NULL;
1828 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001829 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001830}
1831
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001832PyObject *
1833_PyUnicode_FromId(_Py_Identifier *id)
1834{
1835 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001836 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1837 strlen(id->string),
1838 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001839 if (!id->object)
1840 return NULL;
1841 PyUnicode_InternInPlace(&id->object);
1842 assert(!id->next);
1843 id->next = static_strings;
1844 static_strings = id;
1845 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001846 return id->object;
1847}
1848
1849void
1850_PyUnicode_ClearStaticStrings()
1851{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001852 _Py_Identifier *tmp, *s = static_strings;
1853 while (s) {
1854 Py_DECREF(s->object);
1855 s->object = NULL;
1856 tmp = s->next;
1857 s->next = NULL;
1858 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001859 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001860 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001861}
1862
Benjamin Peterson0df54292012-03-26 14:50:32 -04001863/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864
Victor Stinnerd3f08822012-05-29 12:57:52 +02001865PyObject*
1866_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001867{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001868 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001869 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001870 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001871#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001872 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001873#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001874 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001875 }
Victor Stinner785938e2011-12-11 20:09:03 +01001876 unicode = PyUnicode_New(size, 127);
1877 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001878 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001879 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1880 assert(_PyUnicode_CheckConsistency(unicode, 1));
1881 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001882}
1883
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001884static Py_UCS4
1885kind_maxchar_limit(unsigned int kind)
1886{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001887 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001888 case PyUnicode_1BYTE_KIND:
1889 return 0x80;
1890 case PyUnicode_2BYTE_KIND:
1891 return 0x100;
1892 case PyUnicode_4BYTE_KIND:
1893 return 0x10000;
1894 default:
1895 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001896 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001897 }
1898}
1899
Victor Stinnere6abb482012-05-02 01:15:40 +02001900Py_LOCAL_INLINE(Py_UCS4)
1901align_maxchar(Py_UCS4 maxchar)
1902{
1903 if (maxchar <= 127)
1904 return 127;
1905 else if (maxchar <= 255)
1906 return 255;
1907 else if (maxchar <= 65535)
1908 return 65535;
1909 else
1910 return MAX_UNICODE;
1911}
1912
Victor Stinner702c7342011-10-05 13:50:52 +02001913static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001914_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001917 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001918
Serhiy Storchaka678db842013-01-26 12:16:36 +02001919 if (size == 0)
1920 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001921 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001922 if (size == 1)
1923 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001924
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001925 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001926 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927 if (!res)
1928 return NULL;
1929 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001930 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001931 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001932}
1933
Victor Stinnere57b1c02011-09-28 22:20:48 +02001934static PyObject*
1935_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936{
1937 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001938 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001939
Serhiy Storchaka678db842013-01-26 12:16:36 +02001940 if (size == 0)
1941 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001942 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001943 if (size == 1) {
1944 Py_UCS4 ch = u[0];
1945 if (ch < 256)
1946 return get_latin1_char((unsigned char)ch);
1947
1948 res = PyUnicode_New(1, ch);
1949 if (res == NULL)
1950 return NULL;
1951 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1952 assert(_PyUnicode_CheckConsistency(res, 1));
1953 return res;
1954 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001955
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001956 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001957 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 if (!res)
1959 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001960 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001962 else {
1963 _PyUnicode_CONVERT_BYTES(
1964 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1965 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001966 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 return res;
1968}
1969
Victor Stinnere57b1c02011-09-28 22:20:48 +02001970static PyObject*
1971_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972{
1973 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001974 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001975
Serhiy Storchaka678db842013-01-26 12:16:36 +02001976 if (size == 0)
1977 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001978 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001979 if (size == 1) {
1980 Py_UCS4 ch = u[0];
1981 if (ch < 256)
1982 return get_latin1_char((unsigned char)ch);
1983
1984 res = PyUnicode_New(1, ch);
1985 if (res == NULL)
1986 return NULL;
1987 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1988 assert(_PyUnicode_CheckConsistency(res, 1));
1989 return res;
1990 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001991
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001992 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001993 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 if (!res)
1995 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001996 if (max_char < 256)
1997 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1998 PyUnicode_1BYTE_DATA(res));
1999 else if (max_char < 0x10000)
2000 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2001 PyUnicode_2BYTE_DATA(res));
2002 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002004 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 return res;
2006}
2007
2008PyObject*
2009PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2010{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002011 if (size < 0) {
2012 PyErr_SetString(PyExc_ValueError, "size must be positive");
2013 return NULL;
2014 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002015 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002017 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002019 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002021 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002022 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002023 PyErr_SetString(PyExc_SystemError, "invalid kind");
2024 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026}
2027
Victor Stinnerece58de2012-04-23 23:36:38 +02002028Py_UCS4
2029_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2030{
2031 enum PyUnicode_Kind kind;
2032 void *startptr, *endptr;
2033
2034 assert(PyUnicode_IS_READY(unicode));
2035 assert(0 <= start);
2036 assert(end <= PyUnicode_GET_LENGTH(unicode));
2037 assert(start <= end);
2038
2039 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2040 return PyUnicode_MAX_CHAR_VALUE(unicode);
2041
2042 if (start == end)
2043 return 127;
2044
Victor Stinner94d558b2012-04-27 22:26:58 +02002045 if (PyUnicode_IS_ASCII(unicode))
2046 return 127;
2047
Victor Stinnerece58de2012-04-23 23:36:38 +02002048 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002049 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002050 endptr = (char *)startptr + end * kind;
2051 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002052 switch(kind) {
2053 case PyUnicode_1BYTE_KIND:
2054 return ucs1lib_find_max_char(startptr, endptr);
2055 case PyUnicode_2BYTE_KIND:
2056 return ucs2lib_find_max_char(startptr, endptr);
2057 case PyUnicode_4BYTE_KIND:
2058 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002059 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002060 assert(0);
2061 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002062 }
2063}
2064
Victor Stinner25a4b292011-10-06 12:31:55 +02002065/* Ensure that a string uses the most efficient storage, if it is not the
2066 case: create a new string with of the right kind. Write NULL into *p_unicode
2067 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002068static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002069unicode_adjust_maxchar(PyObject **p_unicode)
2070{
2071 PyObject *unicode, *copy;
2072 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002073 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002074 unsigned int kind;
2075
2076 assert(p_unicode != NULL);
2077 unicode = *p_unicode;
2078 assert(PyUnicode_IS_READY(unicode));
2079 if (PyUnicode_IS_ASCII(unicode))
2080 return;
2081
2082 len = PyUnicode_GET_LENGTH(unicode);
2083 kind = PyUnicode_KIND(unicode);
2084 if (kind == PyUnicode_1BYTE_KIND) {
2085 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002086 max_char = ucs1lib_find_max_char(u, u + len);
2087 if (max_char >= 128)
2088 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002089 }
2090 else if (kind == PyUnicode_2BYTE_KIND) {
2091 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002092 max_char = ucs2lib_find_max_char(u, u + len);
2093 if (max_char >= 256)
2094 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002095 }
2096 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002097 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002098 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002099 max_char = ucs4lib_find_max_char(u, u + len);
2100 if (max_char >= 0x10000)
2101 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002102 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002103 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002104 if (copy != NULL)
2105 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002106 Py_DECREF(unicode);
2107 *p_unicode = copy;
2108}
2109
Victor Stinner034f6cf2011-09-30 02:26:44 +02002110PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002111_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002112{
Victor Stinner87af4f22011-11-21 23:03:47 +01002113 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002114 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002115
Victor Stinner034f6cf2011-09-30 02:26:44 +02002116 if (!PyUnicode_Check(unicode)) {
2117 PyErr_BadInternalCall();
2118 return NULL;
2119 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002120 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002121 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002122
Victor Stinner87af4f22011-11-21 23:03:47 +01002123 length = PyUnicode_GET_LENGTH(unicode);
2124 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002125 if (!copy)
2126 return NULL;
2127 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2128
Victor Stinner87af4f22011-11-21 23:03:47 +01002129 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2130 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002131 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002132 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002133}
2134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002135
Victor Stinnerbc603d12011-10-02 01:00:40 +02002136/* Widen Unicode objects to larger buffers. Don't write terminating null
2137 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138
2139void*
2140_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2141{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002142 Py_ssize_t len;
2143 void *result;
2144 unsigned int skind;
2145
Benjamin Petersonbac79492012-01-14 13:34:47 -05002146 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002147 return NULL;
2148
2149 len = PyUnicode_GET_LENGTH(s);
2150 skind = PyUnicode_KIND(s);
2151 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002152 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 return NULL;
2154 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002155 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002156 case PyUnicode_2BYTE_KIND:
2157 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2158 if (!result)
2159 return PyErr_NoMemory();
2160 assert(skind == PyUnicode_1BYTE_KIND);
2161 _PyUnicode_CONVERT_BYTES(
2162 Py_UCS1, Py_UCS2,
2163 PyUnicode_1BYTE_DATA(s),
2164 PyUnicode_1BYTE_DATA(s) + len,
2165 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002167 case PyUnicode_4BYTE_KIND:
2168 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2169 if (!result)
2170 return PyErr_NoMemory();
2171 if (skind == PyUnicode_2BYTE_KIND) {
2172 _PyUnicode_CONVERT_BYTES(
2173 Py_UCS2, Py_UCS4,
2174 PyUnicode_2BYTE_DATA(s),
2175 PyUnicode_2BYTE_DATA(s) + len,
2176 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002178 else {
2179 assert(skind == PyUnicode_1BYTE_KIND);
2180 _PyUnicode_CONVERT_BYTES(
2181 Py_UCS1, Py_UCS4,
2182 PyUnicode_1BYTE_DATA(s),
2183 PyUnicode_1BYTE_DATA(s) + len,
2184 result);
2185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002187 default:
2188 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 }
Victor Stinner01698042011-10-04 00:04:26 +02002190 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002191 return NULL;
2192}
2193
2194static Py_UCS4*
2195as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2196 int copy_null)
2197{
2198 int kind;
2199 void *data;
2200 Py_ssize_t len, targetlen;
2201 if (PyUnicode_READY(string) == -1)
2202 return NULL;
2203 kind = PyUnicode_KIND(string);
2204 data = PyUnicode_DATA(string);
2205 len = PyUnicode_GET_LENGTH(string);
2206 targetlen = len;
2207 if (copy_null)
2208 targetlen++;
2209 if (!target) {
2210 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2211 PyErr_NoMemory();
2212 return NULL;
2213 }
2214 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2215 if (!target) {
2216 PyErr_NoMemory();
2217 return NULL;
2218 }
2219 }
2220 else {
2221 if (targetsize < targetlen) {
2222 PyErr_Format(PyExc_SystemError,
2223 "string is longer than the buffer");
2224 if (copy_null && 0 < targetsize)
2225 target[0] = 0;
2226 return NULL;
2227 }
2228 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002229 if (kind == PyUnicode_1BYTE_KIND) {
2230 Py_UCS1 *start = (Py_UCS1 *) data;
2231 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002233 else if (kind == PyUnicode_2BYTE_KIND) {
2234 Py_UCS2 *start = (Py_UCS2 *) data;
2235 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2236 }
2237 else {
2238 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002240 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 if (copy_null)
2242 target[len] = 0;
2243 return target;
2244}
2245
2246Py_UCS4*
2247PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2248 int copy_null)
2249{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002250 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 PyErr_BadInternalCall();
2252 return NULL;
2253 }
2254 return as_ucs4(string, target, targetsize, copy_null);
2255}
2256
2257Py_UCS4*
2258PyUnicode_AsUCS4Copy(PyObject *string)
2259{
2260 return as_ucs4(string, NULL, 0, 1);
2261}
2262
2263#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002264
Alexander Belopolsky40018472011-02-26 01:02:56 +00002265PyObject *
2266PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002270 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002271 PyErr_BadInternalCall();
2272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 }
2274
Martin v. Löwis790465f2008-04-05 20:41:37 +00002275 if (size == -1) {
2276 size = wcslen(w);
2277 }
2278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280}
2281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002283
Walter Dörwald346737f2007-05-31 10:44:43 +00002284static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002285makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2286 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002287{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002288 *fmt++ = '%';
2289 if (width) {
2290 if (zeropad)
2291 *fmt++ = '0';
2292 fmt += sprintf(fmt, "%d", width);
2293 }
2294 if (precision)
2295 fmt += sprintf(fmt, ".%d", precision);
2296 if (longflag)
2297 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002298 else if (longlongflag) {
2299 /* longlongflag should only ever be nonzero on machines with
2300 HAVE_LONG_LONG defined */
2301#ifdef HAVE_LONG_LONG
2302 char *f = PY_FORMAT_LONG_LONG;
2303 while (*f)
2304 *fmt++ = *f++;
2305#else
2306 /* we shouldn't ever get here */
2307 assert(0);
2308 *fmt++ = 'l';
2309#endif
2310 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002311 else if (size_tflag) {
2312 char *f = PY_FORMAT_SIZE_T;
2313 while (*f)
2314 *fmt++ = *f++;
2315 }
2316 *fmt++ = c;
2317 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002318}
2319
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that follows '%'.

   f must point at the '%'.  The optional width, ".precision" and length
   modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z", each only
   when followed by 'd', 'u' or 'i') are consumed.  The parsed values are
   stored through the p_* pointers; any of them may be NULL to discard that
   value.  The returned pointer is where the caller reads the conversion
   character; for the malformed inputs noted below it is moved back by one
   character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the '%', then parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width * 10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision * 10) + *f - '0';
        /* "%.3%s" => f points to "3" */
        if (*f == '%')
            f--;
    }
    /* bogus format "%.1" => go backward, f points to "1" */
    if (*f == '\0')
        f--;

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle the length modifiers: %ld, %lu, %li, %lld, %llu, %lli and
       %zd, %zu, %zi.  A modifier not followed by d, u or i is left alone. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2384
/* Maximum number of characters required for the output of %ld.
   21 characters allow for a 64-bit integer in decimal (at most 20 digits)
   plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2392
Walter Dörwaldd2034312007-05-18 16:29:38 +00002393PyObject *
2394PyUnicode_FromFormatV(const char *format, va_list vargs)
2395{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002396 va_list count;
2397 Py_ssize_t callcount = 0;
2398 PyObject **callresults = NULL;
2399 PyObject **callresult = NULL;
2400 Py_ssize_t n = 0;
2401 int width = 0;
2402 int precision = 0;
2403 int zeropad;
2404 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002405 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002406 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002407 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2409 Py_UCS4 argmaxchar;
2410 Py_ssize_t numbersize = 0;
2411 char *numberresults = NULL;
2412 char *numberresult = NULL;
2413 Py_ssize_t i;
2414 int kind;
2415 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002416
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002417 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002418 /* step 1: count the number of %S/%R/%A/%s format specifications
2419 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2420 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002422 * also estimate a upper bound for all the number formats in the string,
2423 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002425 for (f = format; *f; f++) {
2426 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002427 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2429 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2430 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2431 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002434#ifdef HAVE_LONG_LONG
2435 if (longlongflag) {
2436 if (width < MAX_LONG_LONG_CHARS)
2437 width = MAX_LONG_LONG_CHARS;
2438 }
2439 else
2440#endif
2441 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2442 including sign. Decimal takes the most space. This
2443 isn't enough for octal. If a width is specified we
2444 need more (which we allocate later). */
2445 if (width < MAX_LONG_CHARS)
2446 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447
2448 /* account for the size + '\0' to separate numbers
2449 inside of the numberresults buffer */
2450 numbersize += (width + 1);
2451 }
2452 }
2453 else if ((unsigned char)*f > 127) {
2454 PyErr_Format(PyExc_ValueError,
2455 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2456 "string, got a non-ASCII byte: 0x%02x",
2457 (unsigned char)*f);
2458 return NULL;
2459 }
2460 }
2461 /* step 2: allocate memory for the results of
2462 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2463 if (callcount) {
2464 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2465 if (!callresults) {
2466 PyErr_NoMemory();
2467 return NULL;
2468 }
2469 callresult = callresults;
2470 }
2471 /* step 2.5: allocate memory for the results of formating numbers */
2472 if (numbersize) {
2473 numberresults = PyObject_Malloc(numbersize);
2474 if (!numberresults) {
2475 PyErr_NoMemory();
2476 goto fail;
2477 }
2478 numberresult = numberresults;
2479 }
2480
2481 /* step 3: format numbers and figure out how large a buffer we need */
2482 for (f = format; *f; f++) {
2483 if (*f == '%') {
2484 const char* p;
2485 int longflag;
2486 int longlongflag;
2487 int size_tflag;
2488 int numprinted;
2489
2490 p = f;
2491 zeropad = (f[1] == '0');
2492 f = parse_format_flags(f, &width, &precision,
2493 &longflag, &longlongflag, &size_tflag);
2494 switch (*f) {
2495 case 'c':
2496 {
2497 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002498 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 n++;
2500 break;
2501 }
2502 case '%':
2503 n++;
2504 break;
2505 case 'i':
2506 case 'd':
2507 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2508 width, precision, *f);
2509 if (longflag)
2510 numprinted = sprintf(numberresult, fmt,
2511 va_arg(count, long));
2512#ifdef HAVE_LONG_LONG
2513 else if (longlongflag)
2514 numprinted = sprintf(numberresult, fmt,
2515 va_arg(count, PY_LONG_LONG));
2516#endif
2517 else if (size_tflag)
2518 numprinted = sprintf(numberresult, fmt,
2519 va_arg(count, Py_ssize_t));
2520 else
2521 numprinted = sprintf(numberresult, fmt,
2522 va_arg(count, int));
2523 n += numprinted;
2524 /* advance by +1 to skip over the '\0' */
2525 numberresult += (numprinted + 1);
2526 assert(*(numberresult - 1) == '\0');
2527 assert(*(numberresult - 2) != '\0');
2528 assert(numprinted >= 0);
2529 assert(numberresult <= numberresults + numbersize);
2530 break;
2531 case 'u':
2532 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2533 width, precision, 'u');
2534 if (longflag)
2535 numprinted = sprintf(numberresult, fmt,
2536 va_arg(count, unsigned long));
2537#ifdef HAVE_LONG_LONG
2538 else if (longlongflag)
2539 numprinted = sprintf(numberresult, fmt,
2540 va_arg(count, unsigned PY_LONG_LONG));
2541#endif
2542 else if (size_tflag)
2543 numprinted = sprintf(numberresult, fmt,
2544 va_arg(count, size_t));
2545 else
2546 numprinted = sprintf(numberresult, fmt,
2547 va_arg(count, unsigned int));
2548 n += numprinted;
2549 numberresult += (numprinted + 1);
2550 assert(*(numberresult - 1) == '\0');
2551 assert(*(numberresult - 2) != '\0');
2552 assert(numprinted >= 0);
2553 assert(numberresult <= numberresults + numbersize);
2554 break;
2555 case 'x':
2556 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2557 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2558 n += numprinted;
2559 numberresult += (numprinted + 1);
2560 assert(*(numberresult - 1) == '\0');
2561 assert(*(numberresult - 2) != '\0');
2562 assert(numprinted >= 0);
2563 assert(numberresult <= numberresults + numbersize);
2564 break;
2565 case 'p':
2566 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2567 /* %p is ill-defined: ensure leading 0x. */
2568 if (numberresult[1] == 'X')
2569 numberresult[1] = 'x';
2570 else if (numberresult[1] != 'x') {
2571 memmove(numberresult + 2, numberresult,
2572 strlen(numberresult) + 1);
2573 numberresult[0] = '0';
2574 numberresult[1] = 'x';
2575 numprinted += 2;
2576 }
2577 n += numprinted;
2578 numberresult += (numprinted + 1);
2579 assert(*(numberresult - 1) == '\0');
2580 assert(*(numberresult - 2) != '\0');
2581 assert(numprinted >= 0);
2582 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 break;
2584 case 's':
2585 {
2586 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002587 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002588 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002589 if (!str)
2590 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591 /* since PyUnicode_DecodeUTF8 returns already flexible
2592 unicode objects, there is no need to call ready on them */
2593 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002594 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002596 /* Remember the str and switch to the next slot */
2597 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002598 break;
2599 }
2600 case 'U':
2601 {
2602 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002603 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 if (PyUnicode_READY(obj) == -1)
2605 goto fail;
2606 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002607 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002609 break;
2610 }
2611 case 'V':
2612 {
2613 PyObject *obj = va_arg(count, PyObject *);
2614 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002615 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002617 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002618 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 if (PyUnicode_READY(obj) == -1)
2620 goto fail;
2621 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002622 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002624 *callresult++ = NULL;
2625 }
2626 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002627 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002628 if (!str_obj)
2629 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002630 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002631 Py_DECREF(str_obj);
2632 goto fail;
2633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002635 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002637 *callresult++ = str_obj;
2638 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 break;
2640 }
2641 case 'S':
2642 {
2643 PyObject *obj = va_arg(count, PyObject *);
2644 PyObject *str;
2645 assert(obj);
2646 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002647 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002649 if (PyUnicode_READY(str) == -1) {
2650 Py_DECREF(str);
2651 goto fail;
2652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002653 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002654 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002655 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 /* Remember the str and switch to the next slot */
2657 *callresult++ = str;
2658 break;
2659 }
2660 case 'R':
2661 {
2662 PyObject *obj = va_arg(count, PyObject *);
2663 PyObject *repr;
2664 assert(obj);
2665 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002666 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002667 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002668 if (PyUnicode_READY(repr) == -1) {
2669 Py_DECREF(repr);
2670 goto fail;
2671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002673 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002675 /* Remember the repr and switch to the next slot */
2676 *callresult++ = repr;
2677 break;
2678 }
2679 case 'A':
2680 {
2681 PyObject *obj = va_arg(count, PyObject *);
2682 PyObject *ascii;
2683 assert(obj);
2684 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002685 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002686 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002687 if (PyUnicode_READY(ascii) == -1) {
2688 Py_DECREF(ascii);
2689 goto fail;
2690 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002691 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002692 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002693 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 /* Remember the repr and switch to the next slot */
2695 *callresult++ = ascii;
2696 break;
2697 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 default:
2699 /* if we stumble upon an unknown
2700 formatting code, copy the rest of
2701 the format string to the output
2702 string. (we cannot just skip the
2703 code, since there's no way to know
2704 what's in the argument list) */
2705 n += strlen(p);
2706 goto expand;
2707 }
2708 } else
2709 n++;
2710 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002711 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002713 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 we don't have to resize the string.
2715 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002716 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002717 if (!string)
2718 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002719 kind = PyUnicode_KIND(string);
2720 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002721 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002722 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002724 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002725 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002726 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002727
2728 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2730 /* checking for == because the last argument could be a empty
2731 string, which causes i to point to end, the assert at the end of
2732 the loop */
2733 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002734
Benjamin Peterson14339b62009-01-31 16:36:08 +00002735 switch (*f) {
2736 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002737 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 const int ordinal = va_arg(vargs, int);
2739 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002740 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002741 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002742 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002743 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002744 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002745 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002746 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002747 {
Victor Stinner184252a2012-06-16 02:57:41 +02002748 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002749 /* unused, since we already have the result */
2750 if (*f == 'p')
2751 (void) va_arg(vargs, void *);
2752 else
2753 (void) va_arg(vargs, int);
2754 /* extract the result from numberresults and append. */
Victor Stinner184252a2012-06-16 02:57:41 +02002755 len = strlen(numberresult);
2756 unicode_write_cstr(string, i, numberresult, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002757 /* skip over the separating '\0' */
Victor Stinner184252a2012-06-16 02:57:41 +02002758 i += len;
2759 numberresult += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002760 assert(*numberresult == '\0');
2761 numberresult++;
2762 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002763 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002764 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002765 case 's':
2766 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002767 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002768 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002769 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770 size = PyUnicode_GET_LENGTH(*callresult);
2771 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002772 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002774 /* We're done with the unicode()/repr() => forget it */
2775 Py_DECREF(*callresult);
2776 /* switch to next unicode()/repr() result */
2777 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002778 break;
2779 }
2780 case 'U':
2781 {
2782 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 Py_ssize_t size;
2784 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2785 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002786 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002788 break;
2789 }
2790 case 'V':
2791 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002793 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002794 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002795 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002796 size = PyUnicode_GET_LENGTH(obj);
2797 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002798 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002800 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801 size = PyUnicode_GET_LENGTH(*callresult);
2802 assert(PyUnicode_KIND(*callresult) <=
2803 PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002804 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002805 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002806 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002807 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002808 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002809 break;
2810 }
2811 case 'S':
2812 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002813 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002814 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002815 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002816 /* unused, since we already have the result */
2817 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002819 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002820 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002821 /* We're done with the unicode()/repr() => forget it */
2822 Py_DECREF(*callresult);
2823 /* switch to next unicode()/repr() result */
2824 ++callresult;
2825 break;
2826 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002827 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002828 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002829 break;
2830 default:
Victor Stinner184252a2012-06-16 02:57:41 +02002831 {
2832 Py_ssize_t len = strlen(p);
2833 unicode_write_cstr(string, i, p, len);
2834 i += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002835 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002836 goto end;
2837 }
Victor Stinner184252a2012-06-16 02:57:41 +02002838 }
Victor Stinner1205f272010-09-11 00:54:47 +00002839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002840 else {
2841 assert(i < PyUnicode_GET_LENGTH(string));
2842 PyUnicode_WRITE(kind, data, i++, *f);
2843 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002845 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002846
Benjamin Peterson29060642009-01-31 22:14:21 +00002847 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002848 if (callresults)
2849 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002850 if (numberresults)
2851 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002852 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002854 if (callresults) {
2855 PyObject **callresult2 = callresults;
2856 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002857 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002858 ++callresult2;
2859 }
2860 PyObject_Free(callresults);
2861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002862 if (numberresults)
2863 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002864 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002865}
2866
Walter Dörwaldd2034312007-05-18 16:29:38 +00002867PyObject *
2868PyUnicode_FromFormat(const char *format, ...)
2869{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002870 PyObject* ret;
2871 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002872
2873#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002874 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002875#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002876 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002877#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002878 ret = PyUnicode_FromFormatV(format, vargs);
2879 va_end(vargs);
2880 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002881}
2882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002883#ifdef HAVE_WCHAR_H
2884
Victor Stinner5593d8a2010-10-02 11:11:27 +00002885/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2886 convert a Unicode object to a wide character string.
2887
Victor Stinnerd88d9832011-09-06 02:00:05 +02002888 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002889 character) required to convert the unicode object. Ignore size argument.
2890
Victor Stinnerd88d9832011-09-06 02:00:05 +02002891 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002892 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002893 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002894static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002895unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002896 wchar_t *w,
2897 Py_ssize_t size)
2898{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002899 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002900 const wchar_t *wstr;
2901
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002902 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002903 if (wstr == NULL)
2904 return -1;
2905
Victor Stinner5593d8a2010-10-02 11:11:27 +00002906 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002907 if (size > res)
2908 size = res + 1;
2909 else
2910 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002911 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002912 return res;
2913 }
2914 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002915 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002916}
2917
2918Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002919PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002920 wchar_t *w,
2921 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922{
2923 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002924 PyErr_BadInternalCall();
2925 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002927 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928}
2929
Victor Stinner137c34c2010-09-29 10:25:54 +00002930wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002931PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002932 Py_ssize_t *size)
2933{
2934 wchar_t* buffer;
2935 Py_ssize_t buflen;
2936
2937 if (unicode == NULL) {
2938 PyErr_BadInternalCall();
2939 return NULL;
2940 }
2941
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002942 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002943 if (buflen == -1)
2944 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002945 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002946 PyErr_NoMemory();
2947 return NULL;
2948 }
2949
Victor Stinner137c34c2010-09-29 10:25:54 +00002950 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2951 if (buffer == NULL) {
2952 PyErr_NoMemory();
2953 return NULL;
2954 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002955 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002956 if (buflen == -1) {
2957 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002958 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002959 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 if (size != NULL)
2961 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002962 return buffer;
2963}
2964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002965#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966
Alexander Belopolsky40018472011-02-26 01:02:56 +00002967PyObject *
2968PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002970 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002971 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002972 PyErr_SetString(PyExc_ValueError,
2973 "chr() arg not in range(0x110000)");
2974 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002975 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002976
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002977 if ((Py_UCS4)ordinal < 256)
2978 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002980 v = PyUnicode_New(1, ordinal);
2981 if (v == NULL)
2982 return NULL;
2983 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002984 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002985 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002986}
2987
Alexander Belopolsky40018472011-02-26 01:02:56 +00002988PyObject *
2989PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002991 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002992 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002993 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002994 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002995 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002996 Py_INCREF(obj);
2997 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002998 }
2999 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003000 /* For a Unicode subtype that's not a Unicode object,
3001 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003002 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003003 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003004 PyErr_Format(PyExc_TypeError,
3005 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003006 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003007 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003008}
3009
Alexander Belopolsky40018472011-02-26 01:02:56 +00003010PyObject *
3011PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003012 const char *encoding,
3013 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003014{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003015 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003016 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003017
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 PyErr_BadInternalCall();
3020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003022
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003023 /* Decoding bytes objects is the most common case and should be fast */
3024 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003025 if (PyBytes_GET_SIZE(obj) == 0)
3026 _Py_RETURN_UNICODE_EMPTY();
3027 v = PyUnicode_Decode(
3028 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3029 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003030 return v;
3031 }
3032
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003033 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003034 PyErr_SetString(PyExc_TypeError,
3035 "decoding str is not supported");
3036 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003037 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003038
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003039 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3040 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3041 PyErr_Format(PyExc_TypeError,
3042 "coercing to str: need bytes, bytearray "
3043 "or buffer-like object, %.80s found",
3044 Py_TYPE(obj)->tp_name);
3045 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003046 }
Tim Petersced69f82003-09-16 20:30:58 +00003047
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003048 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003049 PyBuffer_Release(&buffer);
3050 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003052
Serhiy Storchaka05997252013-01-26 12:14:02 +02003053 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003054 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003055 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056}
3057
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8, storing the null-terminated result in lower.
   A NULL encoding is normalized as "utf-8".

   lower_len is the capacity of lower in bytes, terminating null included.

   Return 1 on success, 0 on error (the normalized name is longer than
   lower_len-1, or lower_len is 0).  On error the contents of lower are
   unspecified and may not be null-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* There must be room for at least the terminating null byte; this
       check also keeps "lower_len - 1" below from wrapping around. */
    if (lower_len == 0)
        return 0;

    /* Send the default name through the bounded loop below rather than
       through an unchecked strcpy(), so that a too-small buffer is
       reported as an error instead of being overrun. */
    if (encoding == NULL)
        encoding = "utf-8";

    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the final '\0' */
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e);
        }
        else if (*e == '_') {
            *l++ = '-';
        }
        else {
            *l++ = *e;
        }
        e++;
    }
    *l = '\0';
    return 1;
}
3094
Alexander Belopolsky40018472011-02-26 01:02:56 +00003095PyObject *
3096PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003097 Py_ssize_t size,
3098 const char *encoding,
3099 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003100{
3101 PyObject *buffer = NULL, *unicode;
3102 Py_buffer info;
3103 char lower[11]; /* Enough for any encoding shortcut */
3104
Fred Drakee4315f52000-05-09 19:53:39 +00003105 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003106 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003107 if ((strcmp(lower, "utf-8") == 0) ||
3108 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003109 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003110 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003111 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003112 (strcmp(lower, "iso-8859-1") == 0))
3113 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003114#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003115 else if (strcmp(lower, "mbcs") == 0)
3116 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003117#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003118 else if (strcmp(lower, "ascii") == 0)
3119 return PyUnicode_DecodeASCII(s, size, errors);
3120 else if (strcmp(lower, "utf-16") == 0)
3121 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3122 else if (strcmp(lower, "utf-32") == 0)
3123 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125
3126 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003127 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003128 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003129 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003130 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131 if (buffer == NULL)
3132 goto onError;
3133 unicode = PyCodec_Decode(buffer, encoding, errors);
3134 if (unicode == NULL)
3135 goto onError;
3136 if (!PyUnicode_Check(unicode)) {
3137 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003138 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003139 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140 Py_DECREF(unicode);
3141 goto onError;
3142 }
3143 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003144 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003145
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 Py_XDECREF(buffer);
3148 return NULL;
3149}
3150
Alexander Belopolsky40018472011-02-26 01:02:56 +00003151PyObject *
3152PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003153 const char *encoding,
3154 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003155{
3156 PyObject *v;
3157
3158 if (!PyUnicode_Check(unicode)) {
3159 PyErr_BadArgument();
3160 goto onError;
3161 }
3162
3163 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003164 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003165
3166 /* Decode via the codec registry */
3167 v = PyCodec_Decode(unicode, encoding, errors);
3168 if (v == NULL)
3169 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003170 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003171
Benjamin Peterson29060642009-01-31 22:14:21 +00003172 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003173 return NULL;
3174}
3175
Alexander Belopolsky40018472011-02-26 01:02:56 +00003176PyObject *
3177PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003178 const char *encoding,
3179 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003180{
3181 PyObject *v;
3182
3183 if (!PyUnicode_Check(unicode)) {
3184 PyErr_BadArgument();
3185 goto onError;
3186 }
3187
3188 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003189 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003190
3191 /* Decode via the codec registry */
3192 v = PyCodec_Decode(unicode, encoding, errors);
3193 if (v == NULL)
3194 goto onError;
3195 if (!PyUnicode_Check(v)) {
3196 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003197 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003198 Py_TYPE(v)->tp_name);
3199 Py_DECREF(v);
3200 goto onError;
3201 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003202 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003203
Benjamin Peterson29060642009-01-31 22:14:21 +00003204 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003205 return NULL;
3206}
3207
Alexander Belopolsky40018472011-02-26 01:02:56 +00003208PyObject *
3209PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003210 Py_ssize_t size,
3211 const char *encoding,
3212 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213{
3214 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003215
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 unicode = PyUnicode_FromUnicode(s, size);
3217 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003218 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3220 Py_DECREF(unicode);
3221 return v;
3222}
3223
Alexander Belopolsky40018472011-02-26 01:02:56 +00003224PyObject *
3225PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003226 const char *encoding,
3227 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003228{
3229 PyObject *v;
3230
3231 if (!PyUnicode_Check(unicode)) {
3232 PyErr_BadArgument();
3233 goto onError;
3234 }
3235
3236 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003237 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003238
3239 /* Encode via the codec registry */
3240 v = PyCodec_Encode(unicode, encoding, errors);
3241 if (v == NULL)
3242 goto onError;
3243 return v;
3244
Benjamin Peterson29060642009-01-31 22:14:21 +00003245 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003246 return NULL;
3247}
3248
/* Locate the first wide character of the null-terminated string wstr that
   wcstombs() refuses to convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 2 bytes wide).  Return the index, in
   wchar_t units, of the first piece that fails, or 0 if every piece
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];           /* up to a surrogate pair, plus null */
#else
    wchar_t chunk[2];           /* one character, plus null */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot always holds the terminator. */
#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *piece = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of the pair together. */
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = cursor[0];
            chunk[1] = 0;
            cursor += 1;
        }
#else
        chunk[0] = cursor[0];
        cursor += 1;
#endif

        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return (size_t)(piece - wstr);
    }

    /* failed to find the unencodable character */
    return 0;
}
3295
Victor Stinner1b579672011-12-17 05:47:23 +01003296static int
3297locale_error_handler(const char *errors, int *surrogateescape)
3298{
3299 if (errors == NULL) {
3300 *surrogateescape = 0;
3301 return 0;
3302 }
3303
3304 if (strcmp(errors, "strict") == 0) {
3305 *surrogateescape = 0;
3306 return 0;
3307 }
3308 if (strcmp(errors, "surrogateescape") == 0) {
3309 *surrogateescape = 1;
3310 return 0;
3311 }
3312 PyErr_Format(PyExc_ValueError,
3313 "only 'strict' and 'surrogateescape' error handlers "
3314 "are supported, not '%s'",
3315 errors);
3316 return -1;
3317}
3318
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003319PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003320PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003321{
3322 Py_ssize_t wlen, wlen2;
3323 wchar_t *wstr;
3324 PyObject *bytes = NULL;
3325 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003326 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003327 PyObject *exc;
3328 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003329 int surrogateescape;
3330
3331 if (locale_error_handler(errors, &surrogateescape) < 0)
3332 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003333
3334 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3335 if (wstr == NULL)
3336 return NULL;
3337
3338 wlen2 = wcslen(wstr);
3339 if (wlen2 != wlen) {
3340 PyMem_Free(wstr);
3341 PyErr_SetString(PyExc_TypeError, "embedded null character");
3342 return NULL;
3343 }
3344
3345 if (surrogateescape) {
3346 /* locale encoding with surrogateescape */
3347 char *str;
3348
3349 str = _Py_wchar2char(wstr, &error_pos);
3350 if (str == NULL) {
3351 if (error_pos == (size_t)-1) {
3352 PyErr_NoMemory();
3353 PyMem_Free(wstr);
3354 return NULL;
3355 }
3356 else {
3357 goto encode_error;
3358 }
3359 }
3360 PyMem_Free(wstr);
3361
3362 bytes = PyBytes_FromString(str);
3363 PyMem_Free(str);
3364 }
3365 else {
3366 size_t len, len2;
3367
3368 len = wcstombs(NULL, wstr, 0);
3369 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003370 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003371 goto encode_error;
3372 }
3373
3374 bytes = PyBytes_FromStringAndSize(NULL, len);
3375 if (bytes == NULL) {
3376 PyMem_Free(wstr);
3377 return NULL;
3378 }
3379
3380 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3381 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003382 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003383 goto encode_error;
3384 }
3385 PyMem_Free(wstr);
3386 }
3387 return bytes;
3388
3389encode_error:
3390 errmsg = strerror(errno);
3391 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003392
3393 if (error_pos == (size_t)-1)
3394 error_pos = wcstombs_errorpos(wstr);
3395
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003396 PyMem_Free(wstr);
3397 Py_XDECREF(bytes);
3398
Victor Stinner2f197072011-12-17 07:08:30 +01003399 if (errmsg != NULL) {
3400 size_t errlen;
3401 wstr = _Py_char2wchar(errmsg, &errlen);
3402 if (wstr != NULL) {
3403 reason = PyUnicode_FromWideChar(wstr, errlen);
3404 PyMem_Free(wstr);
3405 } else
3406 errmsg = NULL;
3407 }
3408 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003409 reason = PyUnicode_FromString(
3410 "wcstombs() encountered an unencodable "
3411 "wide character");
3412 if (reason == NULL)
3413 return NULL;
3414
3415 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3416 "locale", unicode,
3417 (Py_ssize_t)error_pos,
3418 (Py_ssize_t)(error_pos+1),
3419 reason);
3420 Py_DECREF(reason);
3421 if (exc != NULL) {
3422 PyCodec_StrictErrors(exc);
3423 Py_XDECREF(exc);
3424 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003425 return NULL;
3426}
3427
Victor Stinnerad158722010-10-27 00:25:46 +00003428PyObject *
3429PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003430{
Victor Stinner99b95382011-07-04 14:23:54 +02003431#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003432 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003433#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003434 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003435#else
Victor Stinner793b5312011-04-27 00:24:21 +02003436 PyInterpreterState *interp = PyThreadState_GET()->interp;
3437 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3438 cannot use it to encode and decode filenames before it is loaded. Load
3439 the Python codec requires to encode at least its own filename. Use the C
3440 version of the locale codec until the codec registry is initialized and
3441 the Python codec is loaded.
3442
3443 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3444 cannot only rely on it: check also interp->fscodec_initialized for
3445 subinterpreters. */
3446 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003447 return PyUnicode_AsEncodedString(unicode,
3448 Py_FileSystemDefaultEncoding,
3449 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003450 }
3451 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003452 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003453 }
Victor Stinnerad158722010-10-27 00:25:46 +00003454#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003455}
3456
Alexander Belopolsky40018472011-02-26 01:02:56 +00003457PyObject *
3458PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003459 const char *encoding,
3460 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461{
3462 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003463 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003464
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 if (!PyUnicode_Check(unicode)) {
3466 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003467 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 }
Fred Drakee4315f52000-05-09 19:53:39 +00003469
Fred Drakee4315f52000-05-09 19:53:39 +00003470 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003471 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003472 if ((strcmp(lower, "utf-8") == 0) ||
3473 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003474 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003475 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003476 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003477 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003478 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003479 }
Victor Stinner37296e82010-06-10 13:36:23 +00003480 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003481 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003482 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003483 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003484#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003485 else if (strcmp(lower, "mbcs") == 0)
3486 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003487#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003488 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003489 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491
3492 /* Encode via the codec registry */
3493 v = PyCodec_Encode(unicode, encoding, errors);
3494 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003495 return NULL;
3496
3497 /* The normal path */
3498 if (PyBytes_Check(v))
3499 return v;
3500
3501 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003502 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003503 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003504 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003505
3506 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3507 "encoder %s returned bytearray instead of bytes",
3508 encoding);
3509 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003510 Py_DECREF(v);
3511 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003512 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003513
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003514 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3515 Py_DECREF(v);
3516 return b;
3517 }
3518
3519 PyErr_Format(PyExc_TypeError,
3520 "encoder did not return a bytes object (type=%.400s)",
3521 Py_TYPE(v)->tp_name);
3522 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003523 return NULL;
3524}
3525
Alexander Belopolsky40018472011-02-26 01:02:56 +00003526PyObject *
3527PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003528 const char *encoding,
3529 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003530{
3531 PyObject *v;
3532
3533 if (!PyUnicode_Check(unicode)) {
3534 PyErr_BadArgument();
3535 goto onError;
3536 }
3537
3538 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003539 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003540
3541 /* Encode via the codec registry */
3542 v = PyCodec_Encode(unicode, encoding, errors);
3543 if (v == NULL)
3544 goto onError;
3545 if (!PyUnicode_Check(v)) {
3546 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003547 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003548 Py_TYPE(v)->tp_name);
3549 Py_DECREF(v);
3550 goto onError;
3551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003553
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 return NULL;
3556}
3557
/* Find the first byte sequence in str[0..len) that mbrtowc() refuses to
   convert in the current locale.

   Returns the byte offset of that sequence.  Returns 0 when a null
   character or the end of the buffer is reached without an error, and
   always returns 0 when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t used = mbrtowc(&wc, cursor, remaining, &state);

        if (used == 0) {
            /* Reached end of string */
            break;
        }
        if (used == (size_t)-1 || used == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += used;
        remaining -= used;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3588
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003589PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003590PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003591 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003592{
3593 wchar_t smallbuf[256];
3594 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3595 wchar_t *wstr;
3596 size_t wlen, wlen2;
3597 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003598 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003599 size_t error_pos;
3600 char *errmsg;
3601 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003602
3603 if (locale_error_handler(errors, &surrogateescape) < 0)
3604 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003605
3606 if (str[len] != '\0' || len != strlen(str)) {
3607 PyErr_SetString(PyExc_TypeError, "embedded null character");
3608 return NULL;
3609 }
3610
3611 if (surrogateescape)
3612 {
3613 wstr = _Py_char2wchar(str, &wlen);
3614 if (wstr == NULL) {
3615 if (wlen == (size_t)-1)
3616 PyErr_NoMemory();
3617 else
3618 PyErr_SetFromErrno(PyExc_OSError);
3619 return NULL;
3620 }
3621
3622 unicode = PyUnicode_FromWideChar(wstr, wlen);
3623 PyMem_Free(wstr);
3624 }
3625 else {
3626#ifndef HAVE_BROKEN_MBSTOWCS
3627 wlen = mbstowcs(NULL, str, 0);
3628#else
3629 wlen = len;
3630#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003631 if (wlen == (size_t)-1)
3632 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003633 if (wlen+1 <= smallbuf_len) {
3634 wstr = smallbuf;
3635 }
3636 else {
3637 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3638 return PyErr_NoMemory();
3639
3640 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3641 if (!wstr)
3642 return PyErr_NoMemory();
3643 }
3644
3645 /* This shouldn't fail now */
3646 wlen2 = mbstowcs(wstr, str, wlen+1);
3647 if (wlen2 == (size_t)-1) {
3648 if (wstr != smallbuf)
3649 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003650 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003651 }
3652#ifdef HAVE_BROKEN_MBSTOWCS
3653 assert(wlen2 == wlen);
3654#endif
3655 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3656 if (wstr != smallbuf)
3657 PyMem_Free(wstr);
3658 }
3659 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003660
3661decode_error:
3662 errmsg = strerror(errno);
3663 assert(errmsg != NULL);
3664
3665 error_pos = mbstowcs_errorpos(str, len);
3666 if (errmsg != NULL) {
3667 size_t errlen;
3668 wstr = _Py_char2wchar(errmsg, &errlen);
3669 if (wstr != NULL) {
3670 reason = PyUnicode_FromWideChar(wstr, errlen);
3671 PyMem_Free(wstr);
3672 } else
3673 errmsg = NULL;
3674 }
3675 if (errmsg == NULL)
3676 reason = PyUnicode_FromString(
3677 "mbstowcs() encountered an invalid multibyte sequence");
3678 if (reason == NULL)
3679 return NULL;
3680
3681 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3682 "locale", str, len,
3683 (Py_ssize_t)error_pos,
3684 (Py_ssize_t)(error_pos+1),
3685 reason);
3686 Py_DECREF(reason);
3687 if (exc != NULL) {
3688 PyCodec_StrictErrors(exc);
3689 Py_XDECREF(exc);
3690 }
3691 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003692}
3693
3694PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003695PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003696{
3697 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003698 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003699}
3700
3701
3702PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003703PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003704 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003705 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3706}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003707
Christian Heimes5894ba72007-11-04 11:43:14 +00003708PyObject*
3709PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3710{
Victor Stinner99b95382011-07-04 14:23:54 +02003711#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003712 return PyUnicode_DecodeMBCS(s, size, NULL);
3713#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003714 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003715#else
Victor Stinner793b5312011-04-27 00:24:21 +02003716 PyInterpreterState *interp = PyThreadState_GET()->interp;
3717 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3718 cannot use it to encode and decode filenames before it is loaded. Load
3719 the Python codec requires to encode at least its own filename. Use the C
3720 version of the locale codec until the codec registry is initialized and
3721 the Python codec is loaded.
3722
3723 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3724 cannot only rely on it: check also interp->fscodec_initialized for
3725 subinterpreters. */
3726 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003727 return PyUnicode_Decode(s, size,
3728 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003729 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003730 }
3731 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003732 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003733 }
Victor Stinnerad158722010-10-27 00:25:46 +00003734#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003735}
3736
Martin v. Löwis011e8422009-05-05 04:43:17 +00003737
3738int
Antoine Pitrou13348842012-01-29 18:36:34 +01003739_PyUnicode_HasNULChars(PyObject* s)
3740{
3741 static PyObject *nul = NULL;
3742
3743 if (nul == NULL)
3744 nul = PyUnicode_FromStringAndSize("\0", 1);
3745 if (nul == NULL)
3746 return -1;
3747 return PyUnicode_Contains(s, nul);
3748}
3749
3750
3751int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003752PyUnicode_FSConverter(PyObject* arg, void* addr)
3753{
3754 PyObject *output = NULL;
3755 Py_ssize_t size;
3756 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003757 if (arg == NULL) {
3758 Py_DECREF(*(PyObject**)addr);
3759 return 1;
3760 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003761 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003762 output = arg;
3763 Py_INCREF(output);
3764 }
3765 else {
3766 arg = PyUnicode_FromObject(arg);
3767 if (!arg)
3768 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003769 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003770 Py_DECREF(arg);
3771 if (!output)
3772 return 0;
3773 if (!PyBytes_Check(output)) {
3774 Py_DECREF(output);
3775 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3776 return 0;
3777 }
3778 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003779 size = PyBytes_GET_SIZE(output);
3780 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003781 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003782 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003783 Py_DECREF(output);
3784 return 0;
3785 }
3786 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003787 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003788}
3789
3790
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003791int
3792PyUnicode_FSDecoder(PyObject* arg, void* addr)
3793{
3794 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003795 if (arg == NULL) {
3796 Py_DECREF(*(PyObject**)addr);
3797 return 1;
3798 }
3799 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003800 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003801 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003802 output = arg;
3803 Py_INCREF(output);
3804 }
3805 else {
3806 arg = PyBytes_FromObject(arg);
3807 if (!arg)
3808 return 0;
3809 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3810 PyBytes_GET_SIZE(arg));
3811 Py_DECREF(arg);
3812 if (!output)
3813 return 0;
3814 if (!PyUnicode_Check(output)) {
3815 Py_DECREF(output);
3816 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3817 return 0;
3818 }
3819 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003820 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003821 Py_DECREF(output);
3822 return 0;
3823 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003825 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003826 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3827 Py_DECREF(output);
3828 return 0;
3829 }
3830 *(PyObject**)addr = output;
3831 return Py_CLEANUP_SUPPORTED;
3832}
3833
3834
Martin v. Löwis5b222132007-06-10 09:51:05 +00003835char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003837{
Christian Heimesf3863112007-11-22 07:46:41 +00003838 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003839
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003840 if (!PyUnicode_Check(unicode)) {
3841 PyErr_BadArgument();
3842 return NULL;
3843 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003844 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003845 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003846
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003847 if (PyUnicode_UTF8(unicode) == NULL) {
3848 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003849 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3850 if (bytes == NULL)
3851 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003852 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3853 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 Py_DECREF(bytes);
3855 return NULL;
3856 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003857 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3858 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3859 PyBytes_AS_STRING(bytes),
3860 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861 Py_DECREF(bytes);
3862 }
3863
3864 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003865 *psize = PyUnicode_UTF8_LENGTH(unicode);
3866 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003867}
3868
3869char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003871{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003872 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3873}
3874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875Py_UNICODE *
3876PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003878 const unsigned char *one_byte;
3879#if SIZEOF_WCHAR_T == 4
3880 const Py_UCS2 *two_bytes;
3881#else
3882 const Py_UCS4 *four_bytes;
3883 const Py_UCS4 *ucs4_end;
3884 Py_ssize_t num_surrogates;
3885#endif
3886 wchar_t *w;
3887 wchar_t *wchar_end;
3888
3889 if (!PyUnicode_Check(unicode)) {
3890 PyErr_BadArgument();
3891 return NULL;
3892 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003893 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003895 assert(_PyUnicode_KIND(unicode) != 0);
3896 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003898 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3901 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003902 num_surrogates = 0;
3903
3904 for (; four_bytes < ucs4_end; ++four_bytes) {
3905 if (*four_bytes > 0xFFFF)
3906 ++num_surrogates;
3907 }
3908
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003909 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3910 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3911 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912 PyErr_NoMemory();
3913 return NULL;
3914 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003915 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003916
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003917 w = _PyUnicode_WSTR(unicode);
3918 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3919 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3921 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003922 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003924 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3925 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 }
3927 else
3928 *w = *four_bytes;
3929
3930 if (w > wchar_end) {
3931 assert(0 && "Miscalculated string end");
3932 }
3933 }
3934 *w = 0;
3935#else
3936 /* sizeof(wchar_t) == 4 */
3937 Py_FatalError("Impossible unicode object state, wstr and str "
3938 "should share memory already.");
3939 return NULL;
3940#endif
3941 }
3942 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003943 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3944 (_PyUnicode_LENGTH(unicode) + 1));
3945 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003946 PyErr_NoMemory();
3947 return NULL;
3948 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003949 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3950 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3951 w = _PyUnicode_WSTR(unicode);
3952 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003954 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3955 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956 for (; w < wchar_end; ++one_byte, ++w)
3957 *w = *one_byte;
3958 /* null-terminate the wstr */
3959 *w = 0;
3960 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003961 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003963 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 for (; w < wchar_end; ++two_bytes, ++w)
3965 *w = *two_bytes;
3966 /* null-terminate the wstr */
3967 *w = 0;
3968#else
3969 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003970 PyObject_FREE(_PyUnicode_WSTR(unicode));
3971 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972 Py_FatalError("Impossible unicode object state, wstr "
3973 "and str should share memory already.");
3974 return NULL;
3975#endif
3976 }
3977 else {
3978 assert(0 && "This should never happen.");
3979 }
3980 }
3981 }
3982 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003983 *size = PyUnicode_WSTR_LENGTH(unicode);
3984 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003985}
3986
Alexander Belopolsky40018472011-02-26 01:02:56 +00003987Py_UNICODE *
3988PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003990 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991}
3992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993
Alexander Belopolsky40018472011-02-26 01:02:56 +00003994Py_ssize_t
3995PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996{
3997 if (!PyUnicode_Check(unicode)) {
3998 PyErr_BadArgument();
3999 goto onError;
4000 }
4001 return PyUnicode_GET_SIZE(unicode);
4002
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 return -1;
4005}
4006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007Py_ssize_t
4008PyUnicode_GetLength(PyObject *unicode)
4009{
Victor Stinner07621332012-06-16 04:53:46 +02004010 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 PyErr_BadArgument();
4012 return -1;
4013 }
Victor Stinner07621332012-06-16 04:53:46 +02004014 if (PyUnicode_READY(unicode) == -1)
4015 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 return PyUnicode_GET_LENGTH(unicode);
4017}
4018
4019Py_UCS4
4020PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4021{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004022 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4023 PyErr_BadArgument();
4024 return (Py_UCS4)-1;
4025 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004026 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004027 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028 return (Py_UCS4)-1;
4029 }
4030 return PyUnicode_READ_CHAR(unicode, index);
4031}
4032
4033int
4034PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4035{
4036 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004037 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038 return -1;
4039 }
Victor Stinner488fa492011-12-12 00:01:39 +01004040 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004041 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004042 PyErr_SetString(PyExc_IndexError, "string index out of range");
4043 return -1;
4044 }
Victor Stinner488fa492011-12-12 00:01:39 +01004045 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004046 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004047 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4048 PyErr_SetString(PyExc_ValueError, "character out of range");
4049 return -1;
4050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4052 index, ch);
4053 return 0;
4054}
4055
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4061
Victor Stinner554f3f02010-06-16 23:33:54 +00004062/* create or adjust a UnicodeDecodeError */
4063static void
4064make_decode_exception(PyObject **exceptionObject,
4065 const char *encoding,
4066 const char *input, Py_ssize_t length,
4067 Py_ssize_t startpos, Py_ssize_t endpos,
4068 const char *reason)
4069{
4070 if (*exceptionObject == NULL) {
4071 *exceptionObject = PyUnicodeDecodeError_Create(
4072 encoding, input, length, startpos, endpos, reason);
4073 }
4074 else {
4075 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4076 goto onError;
4077 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4078 goto onError;
4079 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4080 goto onError;
4081 }
4082 return;
4083
4084onError:
4085 Py_DECREF(*exceptionObject);
4086 *exceptionObject = NULL;
4087}
4088
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089/* error handling callback helper:
4090 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004091 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092 and adjust various state variables.
4093 return 0 on success, -1 on error
4094*/
4095
Alexander Belopolsky40018472011-02-26 01:02:56 +00004096static int
4097unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004098 const char *encoding, const char *reason,
4099 const char **input, const char **inend, Py_ssize_t *startinpos,
4100 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004101 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004103 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104
4105 PyObject *restuple = NULL;
4106 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004107 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004108 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004109 Py_ssize_t requiredsize;
4110 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004111 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112 int res = -1;
4113
Victor Stinner596a6c42011-11-09 00:02:18 +01004114 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4115 outsize = PyUnicode_GET_LENGTH(*output);
4116 else
4117 outsize = _PyUnicode_WSTR_LENGTH(*output);
4118
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 *errorHandler = PyCodec_LookupError(errors);
4121 if (*errorHandler == NULL)
4122 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 }
4124
Victor Stinner554f3f02010-06-16 23:33:54 +00004125 make_decode_exception(exceptionObject,
4126 encoding,
4127 *input, *inend - *input,
4128 *startinpos, *endinpos,
4129 reason);
4130 if (*exceptionObject == NULL)
4131 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132
4133 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4134 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004137 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 }
4140 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004141 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004142 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004143 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004144
4145 /* Copy back the bytes variables, which might have been modified by the
4146 callback */
4147 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4148 if (!inputobj)
4149 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004150 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004151 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004152 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004153 *input = PyBytes_AS_STRING(inputobj);
4154 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004155 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004156 /* we can DECREF safely, as the exception has another reference,
4157 so the object won't go away. */
4158 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004159
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004162 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4164 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004165 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166
Victor Stinner596a6c42011-11-09 00:02:18 +01004167 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4168 /* need more space? (at least enough for what we
4169 have+the replacement+the rest of the string (starting
4170 at the new input position), so we won't have to check space
4171 when there are no errors in the rest of the string) */
4172 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4173 requiredsize = *outpos + replen + insize-newpos;
4174 if (requiredsize > outsize) {
4175 if (requiredsize<2*outsize)
4176 requiredsize = 2*outsize;
4177 if (unicode_resize(output, requiredsize) < 0)
4178 goto onError;
4179 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004180 if (unicode_widen(output, *outpos,
4181 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004183 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004184 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004186 else {
4187 wchar_t *repwstr;
4188 Py_ssize_t repwlen;
4189 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4190 if (repwstr == NULL)
4191 goto onError;
4192 /* need more space? (at least enough for what we
4193 have+the replacement+the rest of the string (starting
4194 at the new input position), so we won't have to check space
4195 when there are no errors in the rest of the string) */
4196 requiredsize = *outpos + repwlen + insize-newpos;
4197 if (requiredsize > outsize) {
4198 if (requiredsize < 2*outsize)
4199 requiredsize = 2*outsize;
4200 if (unicode_resize(output, requiredsize) < 0)
4201 goto onError;
4202 }
4203 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4204 *outpos += repwlen;
4205 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004207 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004208
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 /* we made it! */
4210 res = 0;
4211
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 Py_XDECREF(restuple);
4214 return res;
4215}
4216
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004217/* --- UTF-7 Codec -------------------------------------------------------- */
4218
Antoine Pitrou244651a2009-05-04 18:56:13 +00004219/* See RFC2152 for details. We encode conservatively and decode liberally. */
4220
/* Three simple macros defining base-64.  All of them may evaluate their
   argument more than once, so pass expressions without side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62,
   anything else ('/') -> 63 */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? ((c) - 'A') :         \
     (((c) >= 'a' && (c) <= 'z') ? ((c) - 'a' + 26) :   \
      (((c) >= '0' && (c) <= '9') ? ((c) - '0' + 52) :  \
       (((c) == '+') ? 62 : 63))))

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
4251
4252/* The UTF-7 encoder treats ASCII characters differently according to
4253 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4254 * the above). See RFC2152. This array identifies these different
4255 * sets:
4256 * 0 : "Set D"
4257 * alphanumeric and '(),-./:?
4258 * 1 : "Set O"
4259 * !"#$%&*;<=>@[]^_`{|}
4260 * 2 : "whitespace"
4261 * ht nl cr sp
4262 * 3 : special (must be base64 encoded)
4263 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4264 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004265
Tim Petersced69f82003-09-16 20:30:58 +00004266static
Antoine Pitrou244651a2009-05-04 18:56:13 +00004267char utf7_category[128] = {
4268/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
4269 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
4270/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
4271 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4272/* sp ! " # $ % & ' ( ) * + , - . / */
4273 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
4274/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
4275 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
4276/* @ A B C D E F G H I J K L M N O */
4277 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4278/* P Q R S T U V W X Y Z [ \ ] ^ _ */
4279 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
4280/* ` a b c d e f g h i j k l m n o */
4281 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4282/* p q r s t u v w x y z { | } ~ del */
4283 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004284};
4285
Antoine Pitrou244651a2009-05-04 18:56:13 +00004286/* ENCODE_DIRECT: this character should be encoded as itself. The
4287 * answer depends on whether we are encoding set O as itself, and also
4288 * on whether we are encoding whitespace as itself. RFC2152 makes it
4289 * clear that the answers to these questions vary between
4290 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00004291
Antoine Pitrou244651a2009-05-04 18:56:13 +00004292#define ENCODE_DIRECT(c, directO, directWS) \
4293 ((c) < 128 && (c) > 0 && \
4294 ((utf7_category[(c)] == 0) || \
4295 (directWS && (utf7_category[(c)] == 2)) || \
4296 (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004297
Alexander Belopolsky40018472011-02-26 01:02:56 +00004298PyObject *
4299PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004300 Py_ssize_t size,
4301 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004302{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004303 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4304}
4305
Antoine Pitrou244651a2009-05-04 18:56:13 +00004306/* The decoder. The only state we preserve is our read position,
4307 * i.e. how many characters we have consumed. So if we end in the
4308 * middle of a shift sequence we have to back off the read position
4309 * and the output to the beginning of the sequence, otherwise we lose
4310 * all the shift state (seen bits, number of bits seen, high
4311 * surrogate). */
4312
Alexander Belopolsky40018472011-02-26 01:02:56 +00004313PyObject *
4314PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004315 Py_ssize_t size,
4316 const char *errors,
4317 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004318{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004320 Py_ssize_t startinpos;
4321 Py_ssize_t endinpos;
4322 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004324 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004325 const char *errmsg = "";
4326 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004327 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004328 unsigned int base64bits = 0;
4329 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004330 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 PyObject *errorHandler = NULL;
4332 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004334 /* Start off assuming it's all ASCII. Widen later as necessary. */
4335 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004336 if (!unicode)
4337 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004338 if (size == 0) {
4339 if (consumed)
4340 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004341 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004342 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004343
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004344 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345 e = s + size;
4346
4347 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004348 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004349 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004350 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 if (inShift) { /* in a base-64 section */
4353 if (IS_BASE64(ch)) { /* consume a base-64 character */
4354 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4355 base64bits += 6;
4356 s++;
4357 if (base64bits >= 16) {
4358 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004359 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 base64bits -= 16;
4361 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4362 if (surrogate) {
4363 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004364 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4365 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004366 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4367 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004369 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004370 }
4371 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004372 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4373 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 }
4376 }
Victor Stinner551ac952011-11-29 22:58:13 +01004377 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004378 /* first surrogate */
4379 surrogate = outCh;
4380 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004382 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4383 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004384 }
4385 }
4386 }
4387 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004388 inShift = 0;
4389 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004390 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004391 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4392 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004393 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004394 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004395 if (base64bits > 0) { /* left-over bits */
4396 if (base64bits >= 6) {
4397 /* We've seen at least one base-64 character */
4398 errmsg = "partial character in shift sequence";
4399 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004400 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004401 else {
4402 /* Some bits remain; they should be zero */
4403 if (base64buffer != 0) {
4404 errmsg = "non-zero padding bits in shift sequence";
4405 goto utf7Error;
4406 }
4407 }
4408 }
4409 if (ch != '-') {
4410 /* '-' is absorbed; other terminating
4411 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004412 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4413 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415 }
4416 }
4417 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 s++; /* consume '+' */
4420 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004421 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004422 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4423 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 }
4425 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004426 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004427 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004429 }
4430 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004431 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004432 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4433 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004434 s++;
4435 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004436 else {
4437 startinpos = s-starts;
4438 s++;
4439 errmsg = "unexpected special character";
4440 goto utf7Error;
4441 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004443utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 endinpos = s-starts;
4445 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 errors, &errorHandler,
4447 "utf7", errmsg,
4448 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004449 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004450 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004451 }
4452
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453 /* end of string */
4454
4455 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4456 /* if we're in an inconsistent state, that's an error */
4457 if (surrogate ||
4458 (base64bits >= 6) ||
4459 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 endinpos = size;
4461 if (unicode_decode_call_errorhandler(
4462 errors, &errorHandler,
4463 "utf7", "unterminated shift sequence",
4464 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004465 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 goto onError;
4467 if (s < e)
4468 goto restart;
4469 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471
4472 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004473 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004474 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004475 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004476 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004477 }
4478 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004479 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004480 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004481 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004482
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004483 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 goto onError;
4485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 Py_XDECREF(errorHandler);
4487 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004488 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004489
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 Py_XDECREF(errorHandler);
4492 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493 Py_DECREF(unicode);
4494 return NULL;
4495}
4496
4497
Alexander Belopolsky40018472011-02-26 01:02:56 +00004498PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004499_PyUnicode_EncodeUTF7(PyObject *str,
4500 int base64SetO,
4501 int base64WhiteSpace,
4502 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004503{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004504 int kind;
4505 void *data;
4506 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004507 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004509 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 unsigned int base64bits = 0;
4511 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004512 char * out;
4513 char * start;
4514
Benjamin Petersonbac79492012-01-14 13:34:47 -05004515 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004516 return NULL;
4517 kind = PyUnicode_KIND(str);
4518 data = PyUnicode_DATA(str);
4519 len = PyUnicode_GET_LENGTH(str);
4520
4521 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004523
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004524 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004525 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004526 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004527 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528 if (v == NULL)
4529 return NULL;
4530
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004531 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004532 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004533 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004534
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 if (inShift) {
4536 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4537 /* shifting out */
4538 if (base64bits) { /* output remaining bits */
4539 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4540 base64buffer = 0;
4541 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542 }
4543 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004544 /* Characters not in the BASE64 set implicitly unshift the sequence
4545 so no '-' is required, except if the character is itself a '-' */
4546 if (IS_BASE64(ch) || ch == '-') {
4547 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004549 *out++ = (char) ch;
4550 }
4551 else {
4552 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004553 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004555 else { /* not in a shift sequence */
4556 if (ch == '+') {
4557 *out++ = '+';
4558 *out++ = '-';
4559 }
4560 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4561 *out++ = (char) ch;
4562 }
4563 else {
4564 *out++ = '+';
4565 inShift = 1;
4566 goto encode_char;
4567 }
4568 }
4569 continue;
4570encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004571 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004572 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004573
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 /* code first surrogate */
4575 base64bits += 16;
4576 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4577 while (base64bits >= 6) {
4578 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4579 base64bits -= 6;
4580 }
4581 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004582 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 base64bits += 16;
4585 base64buffer = (base64buffer << 16) | ch;
4586 while (base64bits >= 6) {
4587 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4588 base64bits -= 6;
4589 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004590 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004591 if (base64bits)
4592 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4593 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004594 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004595 if (_PyBytes_Resize(&v, out - start) < 0)
4596 return NULL;
4597 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004599PyObject *
4600PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4601 Py_ssize_t size,
4602 int base64SetO,
4603 int base64WhiteSpace,
4604 const char *errors)
4605{
4606 PyObject *result;
4607 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4608 if (tmp == NULL)
4609 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004610 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004611 base64WhiteSpace, errors);
4612 Py_DECREF(tmp);
4613 return result;
4614}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004615
/* The UTF-7 helper macros are private to the codec above. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004621
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622/* --- UTF-8 Codec -------------------------------------------------------- */
4623
Alexander Belopolsky40018472011-02-26 01:02:56 +00004624PyObject *
4625PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004626 Py_ssize_t size,
4627 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628{
Walter Dörwald69652032004-09-07 20:24:22 +00004629 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4630}
4631
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004632#include "stringlib/asciilib.h"
4633#include "stringlib/codecs.h"
4634#include "stringlib/undef.h"
4635
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004636#include "stringlib/ucs1lib.h"
4637#include "stringlib/codecs.h"
4638#include "stringlib/undef.h"
4639
4640#include "stringlib/ucs2lib.h"
4641#include "stringlib/codecs.h"
4642#include "stringlib/undef.h"
4643
4644#include "stringlib/ucs4lib.h"
4645#include "stringlib/codecs.h"
4646#include "stringlib/undef.h"
4647
Antoine Pitrouab868312009-01-10 15:40:25 +00004648/* Mask to quickly check whether a C 'long' contains a
4649 non-ASCII, UTF8-encoded char. */
4650#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004651# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004652#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004653# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004654#else
4655# error C 'long' size should be either 4 or 8!
4656#endif
4657
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004658static Py_ssize_t
4659ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004660{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004661 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004662 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004663
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004664#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004665 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4666 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004667 /* Fast path, see in STRINGLIB(utf8_decode) for
4668 an explanation. */
4669 /* Help register allocation */
4670 register const char *_p = p;
4671 register Py_UCS1 * q = dest;
4672 while (_p < aligned_end) {
4673 unsigned long value = *(const unsigned long *) _p;
4674 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004676 *((unsigned long *)q) = value;
4677 _p += SIZEOF_LONG;
4678 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004679 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004680 p = _p;
4681 while (p < end) {
4682 if ((unsigned char)*p & 0x80)
4683 break;
4684 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004686 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688#endif
4689 while (p < end) {
4690 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4691 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004692 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004693 /* Help register allocation */
4694 register const char *_p = p;
4695 while (_p < aligned_end) {
4696 unsigned long value = *(unsigned long *) _p;
4697 if (value & ASCII_CHAR_MASK)
4698 break;
4699 _p += SIZEOF_LONG;
4700 }
4701 p = _p;
4702 if (_p == end)
4703 break;
4704 }
4705 if ((unsigned char)*p & 0x80)
4706 break;
4707 ++p;
4708 }
4709 memcpy(dest, start, p - start);
4710 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711}
Antoine Pitrouab868312009-01-10 15:40:25 +00004712
Victor Stinner785938e2011-12-11 20:09:03 +01004713PyObject *
4714PyUnicode_DecodeUTF8Stateful(const char *s,
4715 Py_ssize_t size,
4716 const char *errors,
4717 Py_ssize_t *consumed)
4718{
Victor Stinner785938e2011-12-11 20:09:03 +01004719 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004720 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721 const char *end = s + size;
4722 Py_ssize_t outpos;
4723
4724 Py_ssize_t startinpos;
4725 Py_ssize_t endinpos;
4726 const char *errmsg = "";
4727 PyObject *errorHandler = NULL;
4728 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004729
4730 if (size == 0) {
4731 if (consumed)
4732 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004733 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004734 }
4735
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004736 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4737 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004738 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004739 *consumed = 1;
4740 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004741 }
4742
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004743 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004744 if (!unicode)
4745 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004746
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004747 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4748 s += outpos;
4749 while (s < end) {
4750 Py_UCS4 ch;
4751 int kind = PyUnicode_KIND(unicode);
4752 if (kind == PyUnicode_1BYTE_KIND) {
4753 if (PyUnicode_IS_ASCII(unicode))
4754 ch = asciilib_utf8_decode(&s, end,
4755 PyUnicode_1BYTE_DATA(unicode), &outpos);
4756 else
4757 ch = ucs1lib_utf8_decode(&s, end,
4758 PyUnicode_1BYTE_DATA(unicode), &outpos);
4759 } else if (kind == PyUnicode_2BYTE_KIND) {
4760 ch = ucs2lib_utf8_decode(&s, end,
4761 PyUnicode_2BYTE_DATA(unicode), &outpos);
4762 } else {
4763 assert(kind == PyUnicode_4BYTE_KIND);
4764 ch = ucs4lib_utf8_decode(&s, end,
4765 PyUnicode_4BYTE_DATA(unicode), &outpos);
4766 }
4767
4768 switch (ch) {
4769 case 0:
4770 if (s == end || consumed)
4771 goto End;
4772 errmsg = "unexpected end of data";
4773 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004774 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004775 break;
4776 case 1:
4777 errmsg = "invalid start byte";
4778 startinpos = s - starts;
4779 endinpos = startinpos + 1;
4780 break;
4781 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004782 case 3:
4783 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 errmsg = "invalid continuation byte";
4785 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004786 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004787 break;
4788 default:
4789 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4790 goto onError;
4791 continue;
4792 }
4793
4794 if (unicode_decode_call_errorhandler(
4795 errors, &errorHandler,
4796 "utf-8", errmsg,
4797 &starts, &end, &startinpos, &endinpos, &exc, &s,
4798 &unicode, &outpos))
4799 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004800 }
4801
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004802End:
4803 if (unicode_resize(&unicode, outpos) < 0)
4804 goto onError;
4805
4806 if (consumed)
4807 *consumed = s - starts;
4808
4809 Py_XDECREF(errorHandler);
4810 Py_XDECREF(exc);
4811 assert(_PyUnicode_CheckConsistency(unicode, 1));
4812 return unicode;
4813
4814onError:
4815 Py_XDECREF(errorHandler);
4816 Py_XDECREF(exc);
4817 Py_XDECREF(unicode);
4818 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004819}
4820
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Every byte that cannot be decoded is stored as the code unit
   0xDC00 + byte.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory), or NULL on memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* not expected when decoding into 32-bit units */
            assert(0);
#else
            /* ch is a non-BMP code point, not a surrogate: it is what
               gets split into a surrogate pair below */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch == 0 with all input consumed means we are done */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004877/* Primary internal function which creates utf8 encoded bytes objects.
4878
4879 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004880 and allocate exactly as much space needed at the end. Else allocate the
4881 maximum possible needed (4 result bytes per Unicode character), and return
4882 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004883*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004884PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004885_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886{
Victor Stinner6099a032011-12-18 14:22:26 +01004887 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004888 void *data;
4889 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004891 if (!PyUnicode_Check(unicode)) {
4892 PyErr_BadArgument();
4893 return NULL;
4894 }
4895
4896 if (PyUnicode_READY(unicode) == -1)
4897 return NULL;
4898
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004899 if (PyUnicode_UTF8(unicode))
4900 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4901 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004902
4903 kind = PyUnicode_KIND(unicode);
4904 data = PyUnicode_DATA(unicode);
4905 size = PyUnicode_GET_LENGTH(unicode);
4906
Benjamin Petersonead6b532011-12-20 17:23:42 -06004907 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004908 default:
4909 assert(0);
4910 case PyUnicode_1BYTE_KIND:
4911 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4912 assert(!PyUnicode_IS_ASCII(unicode));
4913 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4914 case PyUnicode_2BYTE_KIND:
4915 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4916 case PyUnicode_4BYTE_KIND:
4917 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919}
4920
Alexander Belopolsky40018472011-02-26 01:02:56 +00004921PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004922PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4923 Py_ssize_t size,
4924 const char *errors)
4925{
4926 PyObject *v, *unicode;
4927
4928 unicode = PyUnicode_FromUnicode(s, size);
4929 if (unicode == NULL)
4930 return NULL;
4931 v = _PyUnicode_AsUTF8String(unicode, errors);
4932 Py_DECREF(unicode);
4933 return v;
4934}
4935
4936PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004937PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004939 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940}
4941
Walter Dörwald41980ca2007-08-16 21:55:45 +00004942/* --- UTF-32 Codec ------------------------------------------------------- */
4943
4944PyObject *
4945PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 Py_ssize_t size,
4947 const char *errors,
4948 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004949{
4950 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4951}
4952
4953PyObject *
4954PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 Py_ssize_t size,
4956 const char *errors,
4957 int *byteorder,
4958 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959{
4960 const char *starts = s;
4961 Py_ssize_t startinpos;
4962 Py_ssize_t endinpos;
4963 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004964 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004965 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004966 int bo = 0; /* assume native ordering by default */
4967 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004968 /* Offsets from q for retrieving bytes in the right order. */
4969#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4970 int iorder[] = {0, 1, 2, 3};
4971#else
4972 int iorder[] = {3, 2, 1, 0};
4973#endif
4974 PyObject *errorHandler = NULL;
4975 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004976
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977 q = (unsigned char *)s;
4978 e = q + size;
4979
4980 if (byteorder)
4981 bo = *byteorder;
4982
4983 /* Check for BOM marks (U+FEFF) in the input and adjust current
4984 byte order setting accordingly. In native mode, the leading BOM
4985 mark is skipped, in all other modes, it is copied to the output
4986 stream as-is (giving a ZWNBSP character). */
4987 if (bo == 0) {
4988 if (size >= 4) {
4989 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004991#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004992 if (bom == 0x0000FEFF) {
4993 q += 4;
4994 bo = -1;
4995 }
4996 else if (bom == 0xFFFE0000) {
4997 q += 4;
4998 bo = 1;
4999 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005000#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005001 if (bom == 0x0000FEFF) {
5002 q += 4;
5003 bo = 1;
5004 }
5005 else if (bom == 0xFFFE0000) {
5006 q += 4;
5007 bo = -1;
5008 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005009#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005011 }
5012
5013 if (bo == -1) {
5014 /* force LE */
5015 iorder[0] = 0;
5016 iorder[1] = 1;
5017 iorder[2] = 2;
5018 iorder[3] = 3;
5019 }
5020 else if (bo == 1) {
5021 /* force BE */
5022 iorder[0] = 3;
5023 iorder[1] = 2;
5024 iorder[2] = 1;
5025 iorder[3] = 0;
5026 }
5027
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005028 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005029 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005030 if (!unicode)
5031 return NULL;
5032 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005033 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005034 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005035
Walter Dörwald41980ca2007-08-16 21:55:45 +00005036 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 Py_UCS4 ch;
5038 /* remaining bytes at the end? (size should be divisible by 4) */
5039 if (e-q<4) {
5040 if (consumed)
5041 break;
5042 errmsg = "truncated data";
5043 startinpos = ((const char *)q)-starts;
5044 endinpos = ((const char *)e)-starts;
5045 goto utf32Error;
5046 /* The remaining input chars are ignored if the callback
5047 chooses to skip the input */
5048 }
5049 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5050 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 if (ch >= 0x110000)
5053 {
5054 errmsg = "codepoint not in range(0x110000)";
5055 startinpos = ((const char *)q)-starts;
5056 endinpos = startinpos+4;
5057 goto utf32Error;
5058 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005059 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5060 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 q += 4;
5062 continue;
5063 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 if (unicode_decode_call_errorhandler(
5065 errors, &errorHandler,
5066 "utf32", errmsg,
5067 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005068 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070 }
5071
5072 if (byteorder)
5073 *byteorder = bo;
5074
5075 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077
5078 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005079 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005080 goto onError;
5081
5082 Py_XDECREF(errorHandler);
5083 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005084 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005085
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087 Py_DECREF(unicode);
5088 Py_XDECREF(errorHandler);
5089 Py_XDECREF(exc);
5090 return NULL;
5091}
5092
5093PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005094_PyUnicode_EncodeUTF32(PyObject *str,
5095 const char *errors,
5096 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005097{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005098 int kind;
5099 void *data;
5100 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005101 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005102 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005103 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005104 /* Offsets from p for storing byte pairs in the right order. */
5105#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5106 int iorder[] = {0, 1, 2, 3};
5107#else
5108 int iorder[] = {3, 2, 1, 0};
5109#endif
5110
Benjamin Peterson29060642009-01-31 22:14:21 +00005111#define STORECHAR(CH) \
5112 do { \
5113 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5114 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5115 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5116 p[iorder[0]] = (CH) & 0xff; \
5117 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005118 } while(0)
5119
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005120 if (!PyUnicode_Check(str)) {
5121 PyErr_BadArgument();
5122 return NULL;
5123 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005124 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005125 return NULL;
5126 kind = PyUnicode_KIND(str);
5127 data = PyUnicode_DATA(str);
5128 len = PyUnicode_GET_LENGTH(str);
5129
5130 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005131 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005133 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005134 if (v == NULL)
5135 return NULL;
5136
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005137 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005139 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005140 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005141 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142
5143 if (byteorder == -1) {
5144 /* force LE */
5145 iorder[0] = 0;
5146 iorder[1] = 1;
5147 iorder[2] = 2;
5148 iorder[3] = 3;
5149 }
5150 else if (byteorder == 1) {
5151 /* force BE */
5152 iorder[0] = 3;
5153 iorder[1] = 2;
5154 iorder[2] = 1;
5155 iorder[3] = 0;
5156 }
5157
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005158 for (i = 0; i < len; i++)
5159 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005160
5161 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005162 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005163#undef STORECHAR
5164}
5165
Alexander Belopolsky40018472011-02-26 01:02:56 +00005166PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005167PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5168 Py_ssize_t size,
5169 const char *errors,
5170 int byteorder)
5171{
5172 PyObject *result;
5173 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5174 if (tmp == NULL)
5175 return NULL;
5176 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5177 Py_DECREF(tmp);
5178 return result;
5179}
5180
5181PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005182PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005183{
Victor Stinnerb960b342011-11-20 19:12:52 +01005184 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005185}
5186
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187/* --- UTF-16 Codec ------------------------------------------------------- */
5188
Tim Peters772747b2001-08-09 22:21:55 +00005189PyObject *
5190PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 Py_ssize_t size,
5192 const char *errors,
5193 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194{
Walter Dörwald69652032004-09-07 20:24:22 +00005195 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5196}
5197
5198PyObject *
5199PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 Py_ssize_t size,
5201 const char *errors,
5202 int *byteorder,
5203 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005204{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005206 Py_ssize_t startinpos;
5207 Py_ssize_t endinpos;
5208 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005209 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005210 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005211 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005212 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005213 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 PyObject *errorHandler = NULL;
5215 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216
Tim Peters772747b2001-08-09 22:21:55 +00005217 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005218 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219
5220 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005221 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005223 /* Check for BOM marks (U+FEFF) in the input and adjust current
5224 byte order setting accordingly. In native mode, the leading BOM
5225 mark is skipped, in all other modes, it is copied to the output
5226 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005227 if (bo == 0 && size >= 2) {
5228 const Py_UCS4 bom = (q[1] << 8) | q[0];
5229 if (bom == 0xFEFF) {
5230 q += 2;
5231 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005233 else if (bom == 0xFFFE) {
5234 q += 2;
5235 bo = 1;
5236 }
5237 if (byteorder)
5238 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240
Antoine Pitrou63065d72012-05-15 23:48:04 +02005241 if (q == e) {
5242 if (consumed)
5243 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005244 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005245 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005246
Antoine Pitrouab868312009-01-10 15:40:25 +00005247#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005248 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005249#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005250 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005251#endif
Tim Peters772747b2001-08-09 22:21:55 +00005252
Antoine Pitrou63065d72012-05-15 23:48:04 +02005253 /* Note: size will always be longer than the resulting Unicode
5254 character count */
5255 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5256 if (!unicode)
5257 return NULL;
5258
5259 outpos = 0;
5260 while (1) {
5261 Py_UCS4 ch = 0;
5262 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005263 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005264 if (kind == PyUnicode_1BYTE_KIND) {
5265 if (PyUnicode_IS_ASCII(unicode))
5266 ch = asciilib_utf16_decode(&q, e,
5267 PyUnicode_1BYTE_DATA(unicode), &outpos,
5268 native_ordering);
5269 else
5270 ch = ucs1lib_utf16_decode(&q, e,
5271 PyUnicode_1BYTE_DATA(unicode), &outpos,
5272 native_ordering);
5273 } else if (kind == PyUnicode_2BYTE_KIND) {
5274 ch = ucs2lib_utf16_decode(&q, e,
5275 PyUnicode_2BYTE_DATA(unicode), &outpos,
5276 native_ordering);
5277 } else {
5278 assert(kind == PyUnicode_4BYTE_KIND);
5279 ch = ucs4lib_utf16_decode(&q, e,
5280 PyUnicode_4BYTE_DATA(unicode), &outpos,
5281 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005282 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005283 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005284
Antoine Pitrou63065d72012-05-15 23:48:04 +02005285 switch (ch)
5286 {
5287 case 0:
5288 /* remaining byte at the end? (size should be even) */
5289 if (q == e || consumed)
5290 goto End;
5291 errmsg = "truncated data";
5292 startinpos = ((const char *)q) - starts;
5293 endinpos = ((const char *)e) - starts;
5294 break;
5295 /* The remaining input chars are ignored if the callback
5296 chooses to skip the input */
5297 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005298 q -= 2;
5299 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005300 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005301 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005302 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005303 endinpos = ((const char *)e) - starts;
5304 break;
5305 case 2:
5306 errmsg = "illegal encoding";
5307 startinpos = ((const char *)q) - 2 - starts;
5308 endinpos = startinpos + 2;
5309 break;
5310 case 3:
5311 errmsg = "illegal UTF-16 surrogate";
5312 startinpos = ((const char *)q) - 4 - starts;
5313 endinpos = startinpos + 2;
5314 break;
5315 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005316 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5317 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005318 continue;
5319 }
5320
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005322 errors,
5323 &errorHandler,
5324 "utf16", errmsg,
5325 &starts,
5326 (const char **)&e,
5327 &startinpos,
5328 &endinpos,
5329 &exc,
5330 (const char **)&q,
5331 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005332 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 }
5335
Antoine Pitrou63065d72012-05-15 23:48:04 +02005336End:
Walter Dörwald69652032004-09-07 20:24:22 +00005337 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005339
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005341 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 goto onError;
5343
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005344 Py_XDECREF(errorHandler);
5345 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005346 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347
Benjamin Peterson29060642009-01-31 22:14:21 +00005348 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005350 Py_XDECREF(errorHandler);
5351 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 return NULL;
5353}
5354
Tim Peters772747b2001-08-09 22:21:55 +00005355PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005356_PyUnicode_EncodeUTF16(PyObject *str,
5357 const char *errors,
5358 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005360 enum PyUnicode_Kind kind;
5361 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005362 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005363 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005364 unsigned short *out;
5365 Py_ssize_t bytesize;
5366 Py_ssize_t pairs;
5367#ifdef WORDS_BIGENDIAN
5368 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005369#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005370 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005371#endif
5372
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005373 if (!PyUnicode_Check(str)) {
5374 PyErr_BadArgument();
5375 return NULL;
5376 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005377 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005378 return NULL;
5379 kind = PyUnicode_KIND(str);
5380 data = PyUnicode_DATA(str);
5381 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005382
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005383 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005384 if (kind == PyUnicode_4BYTE_KIND) {
5385 const Py_UCS4 *in = (const Py_UCS4 *)data;
5386 const Py_UCS4 *end = in + len;
5387 while (in < end)
5388 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005390 }
5391 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005393 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005394 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 if (v == NULL)
5396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005398 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005399 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005400 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005402 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005403 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005404 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005405
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005406 switch (kind) {
5407 case PyUnicode_1BYTE_KIND: {
5408 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5409 break;
Tim Peters772747b2001-08-09 22:21:55 +00005410 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005411 case PyUnicode_2BYTE_KIND: {
5412 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5413 break;
Tim Peters772747b2001-08-09 22:21:55 +00005414 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005415 case PyUnicode_4BYTE_KIND: {
5416 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5417 break;
5418 }
5419 default:
5420 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005421 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005422
5423 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005424 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425}
5426
Alexander Belopolsky40018472011-02-26 01:02:56 +00005427PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005428PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5429 Py_ssize_t size,
5430 const char *errors,
5431 int byteorder)
5432{
5433 PyObject *result;
5434 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5435 if (tmp == NULL)
5436 return NULL;
5437 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5438 Py_DECREF(tmp);
5439 return result;
5440}
5441
5442PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005443PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005445 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446}
5447
5448/* --- Unicode Escape Codec ----------------------------------------------- */
5449
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the input, once its backslash escapes are processed, is still a pure
   ASCII string.

   s    - input bytes (need not be NUL-terminated).
   size - number of bytes at s.

   Returns the number of characters needed to hold the decoded string, or
   -1 when the result cannot be guaranteed to be ASCII or the input cannot
   be sized here: a negative size, a byte above 127, a backslash that is
   the last byte, or an octal, \x, \u, \U or \N escape.

   Counting rules: backslash + newline contributes nothing, the one-letter
   escapes (\\ \' \" \b \f \t \n \r \v \a) contribute one character, and
   any other escape is kept as two characters (backslash + the byte).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before computing `s + size`: forming a pointer in
       front of the buffer would be undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5504
/* Cached pointer to the character-name lookup API obtained with
   PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, ...).  It starts out NULL
   and is filled in by PyUnicode_DecodeUnicodeEscape() the first time a
   \N{...} escape has to be resolved. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005505static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005506
Alexander Belopolsky40018472011-02-26 01:02:56 +00005507PyObject *
5508PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005509 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005510 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005513 Py_ssize_t startinpos;
5514 Py_ssize_t endinpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005515 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005517 char* message;
5518 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005519 PyObject *errorHandler = NULL;
5520 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005521 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005522 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005523
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005524 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005525
5526 /* After length_of_escaped_ascii_string() there are two alternatives,
5527 either the string is pure ASCII with named escapes like \n, etc.
5528 and we determined it's exact size (common case)
5529 or it contains \x, \u, ... escape sequences. then we create a
5530 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005531 if (len >= 0) {
5532 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005533 if (!v)
5534 goto onError;
5535 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005536 }
5537 else {
5538 /* Escaped strings will always be longer than the resulting
5539 Unicode string, so we start with size here and then reduce the
5540 length after conversion to the true value.
5541 (but if the error callback returns a long replacement string
5542 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005543 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005544 if (!v)
5545 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005546 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005547 }
5548
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005550 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005551 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005553
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 while (s < end) {
5555 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005556 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005557 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005559 /* The only case in which i == ascii_length is a backslash
5560 followed by a newline. */
5561 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005562
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 /* Non-escape characters are interpreted as Unicode ordinals */
5564 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005565 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5566 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 continue;
5568 }
5569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 /* \ - Escapes */
5572 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005573 c = *s++;
5574 if (s > end)
5575 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005576
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005577 /* The only case in which i == ascii_length is a backslash
5578 followed by a newline. */
5579 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005580
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005581 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005584#define WRITECHAR(ch) \
5585 do { \
5586 if (unicode_putchar(&v, &i, ch) < 0) \
5587 goto onError; \
5588 }while(0)
5589
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005591 case '\\': WRITECHAR('\\'); break;
5592 case '\'': WRITECHAR('\''); break;
5593 case '\"': WRITECHAR('\"'); break;
5594 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005595 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005596 case 'f': WRITECHAR('\014'); break;
5597 case 't': WRITECHAR('\t'); break;
5598 case 'n': WRITECHAR('\n'); break;
5599 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005600 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005601 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005602 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005603 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 case '0': case '1': case '2': case '3':
5607 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005608 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005609 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005610 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005611 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005612 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005614 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 break;
5616
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 /* hex escapes */
5618 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005620 digits = 2;
5621 message = "truncated \\xXX escape";
5622 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005626 digits = 4;
5627 message = "truncated \\uXXXX escape";
5628 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005631 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005632 digits = 8;
5633 message = "truncated \\UXXXXXXXX escape";
5634 hexescape:
5635 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005636 if (end - s < digits) {
5637 /* count only hex digits */
5638 for (; s < end; ++s) {
5639 c = (unsigned char)*s;
5640 if (!Py_ISXDIGIT(c))
5641 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005642 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005643 goto error;
5644 }
5645 for (; digits--; ++s) {
5646 c = (unsigned char)*s;
5647 if (!Py_ISXDIGIT(c))
5648 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005649 chr = (chr<<4) & ~0xF;
5650 if (c >= '0' && c <= '9')
5651 chr += c - '0';
5652 else if (c >= 'a' && c <= 'f')
5653 chr += 10 + c - 'a';
5654 else
5655 chr += 10 + c - 'A';
5656 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005657 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005658 /* _decoding_error will have already written into the
5659 target buffer. */
5660 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005661 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005662 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005663 message = "illegal Unicode character";
5664 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005665 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005666 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005667 break;
5668
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005670 case 'N':
5671 message = "malformed \\N character escape";
5672 if (ucnhash_CAPI == NULL) {
5673 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005674 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5675 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005676 if (ucnhash_CAPI == NULL)
5677 goto ucnhashError;
5678 }
5679 if (*s == '{') {
5680 const char *start = s+1;
5681 /* look for the closing brace */
5682 while (*s != '}' && s < end)
5683 s++;
5684 if (s > start && s < end && *s == '}') {
5685 /* found a name. look it up in the unicode database */
5686 message = "unknown Unicode character name";
5687 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005688 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005689 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005690 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691 goto store;
5692 }
5693 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005694 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005695
5696 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005697 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 message = "\\ at end of string";
5699 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005700 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005701 }
5702 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005703 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005704 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005705 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005706 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005708 continue;
5709
5710 error:
5711 endinpos = s-starts;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005712 if (unicode_decode_call_errorhandler(
5713 errors, &errorHandler,
5714 "unicodeescape", message,
5715 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005716 &v, &i))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005717 goto onError;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005718 len = PyUnicode_GET_LENGTH(v);
Serhiy Storchakad6793772013-01-29 10:20:44 +02005719 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005721#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005722
Victor Stinner16e6a802011-12-12 13:24:15 +01005723 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005724 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005725 Py_XDECREF(errorHandler);
5726 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005727 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005728
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005730 PyErr_SetString(
5731 PyExc_UnicodeError,
5732 "\\N escapes not supported (can't load unicodedata module)"
5733 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005734 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 Py_XDECREF(errorHandler);
5736 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005737 return NULL;
5738
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 Py_XDECREF(errorHandler);
5742 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 return NULL;
5744}
5745
5746/* Return a Unicode-Escape string version of the Unicode object.
5747
5748 If quotes is true, the string is enclosed in u"" or u'' quotes as
5749 appropriate.
5750
5751*/
5752
Alexander Belopolsky40018472011-02-26 01:02:56 +00005753PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005754PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005756 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005757 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005759 int kind;
5760 void *data;
5761 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762
Ezio Melottie7f90372012-10-05 03:33:31 +03005763 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005764 escape.
5765
Ezio Melottie7f90372012-10-05 03:33:31 +03005766 For UCS1 strings it's '\xxx', 4 bytes per source character.
5767 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5768 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005769 */
5770
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005771 if (!PyUnicode_Check(unicode)) {
5772 PyErr_BadArgument();
5773 return NULL;
5774 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005775 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005776 return NULL;
5777 len = PyUnicode_GET_LENGTH(unicode);
5778 kind = PyUnicode_KIND(unicode);
5779 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005780 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005781 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5782 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5783 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5784 }
5785
5786 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005787 return PyBytes_FromStringAndSize(NULL, 0);
5788
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005789 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005791
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005792 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005794 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 if (repr == NULL)
5797 return NULL;
5798
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005799 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005801 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005802 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005803
Walter Dörwald79e913e2007-05-12 11:08:06 +00005804 /* Escape backslashes */
5805 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 *p++ = '\\';
5807 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005808 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005809 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005810
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005811 /* Map 21-bit characters to '\U00xxxxxx' */
5812 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005813 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005814 *p++ = '\\';
5815 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005816 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5817 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5818 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5819 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5820 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5821 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5822 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5823 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005825 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005826
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005828 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 *p++ = '\\';
5830 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005831 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5832 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5833 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5834 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005836
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005837 /* Map special whitespace to '\t', \n', '\r' */
5838 else if (ch == '\t') {
5839 *p++ = '\\';
5840 *p++ = 't';
5841 }
5842 else if (ch == '\n') {
5843 *p++ = '\\';
5844 *p++ = 'n';
5845 }
5846 else if (ch == '\r') {
5847 *p++ = '\\';
5848 *p++ = 'r';
5849 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005850
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005851 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005852 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005854 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005855 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5856 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005857 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005858
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 /* Copy everything else as-is */
5860 else
5861 *p++ = (char) ch;
5862 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005864 assert(p - PyBytes_AS_STRING(repr) > 0);
5865 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5866 return NULL;
5867 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868}
5869
Alexander Belopolsky40018472011-02-26 01:02:56 +00005870PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005871PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5872 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005874 PyObject *result;
5875 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5876 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005878 result = PyUnicode_AsUnicodeEscapeString(tmp);
5879 Py_DECREF(tmp);
5880 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881}
5882
5883/* --- Raw Unicode Escape Codec ------------------------------------------- */
5884
Alexander Belopolsky40018472011-02-26 01:02:56 +00005885PyObject *
5886PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005887 Py_ssize_t size,
5888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005890 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005891 Py_ssize_t startinpos;
5892 Py_ssize_t endinpos;
5893 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005894 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 const char *end;
5896 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 PyObject *errorHandler = NULL;
5898 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005899
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 /* Escaped strings will always be longer than the resulting
5901 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005902 length after conversion to the true value. (But decoding error
5903 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005904 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005908 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005909 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 end = s + size;
5911 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 unsigned char c;
5913 Py_UCS4 x;
5914 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005915 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 /* Non-escape characters are interpreted as Unicode ordinals */
5918 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005919 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5920 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005922 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 startinpos = s-starts;
5924
5925 /* \u-escapes are only interpreted iff the number of leading
5926 backslashes if odd */
5927 bs = s;
5928 for (;s < end;) {
5929 if (*s != '\\')
5930 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005931 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5932 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 }
5934 if (((s - bs) & 1) == 0 ||
5935 s >= end ||
5936 (*s != 'u' && *s != 'U')) {
5937 continue;
5938 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005939 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 count = *s=='u' ? 4 : 8;
5941 s++;
5942
5943 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 for (x = 0, i = 0; i < count; ++i, ++s) {
5945 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005946 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 endinpos = s-starts;
5948 if (unicode_decode_call_errorhandler(
5949 errors, &errorHandler,
5950 "rawunicodeescape", "truncated \\uXXXX",
5951 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005952 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 goto onError;
5954 goto nextByte;
5955 }
5956 x = (x<<4) & ~0xF;
5957 if (c >= '0' && c <= '9')
5958 x += c - '0';
5959 else if (c >= 'a' && c <= 'f')
5960 x += 10 + c - 'a';
5961 else
5962 x += 10 + c - 'A';
5963 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005964 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005965 if (unicode_putchar(&v, &outpos, x) < 0)
5966 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005967 } else {
5968 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005969 if (unicode_decode_call_errorhandler(
5970 errors, &errorHandler,
5971 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005973 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005975 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 nextByte:
5977 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 }
Victor Stinner16e6a802011-12-12 13:24:15 +01005979 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005981 Py_XDECREF(errorHandler);
5982 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005983 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00005984
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987 Py_XDECREF(errorHandler);
5988 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 return NULL;
5990}
5991
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005992
Alexander Belopolsky40018472011-02-26 01:02:56 +00005993PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005994PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005996 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 char *p;
5998 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005999 Py_ssize_t expandsize, pos;
6000 int kind;
6001 void *data;
6002 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006004 if (!PyUnicode_Check(unicode)) {
6005 PyErr_BadArgument();
6006 return NULL;
6007 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006008 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006009 return NULL;
6010 kind = PyUnicode_KIND(unicode);
6011 data = PyUnicode_DATA(unicode);
6012 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006013 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6014 bytes, and 1 byte characters 4. */
6015 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006016
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006017 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006019
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006020 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 if (repr == NULL)
6022 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006023 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006024 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006026 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006027 for (pos = 0; pos < len; pos++) {
6028 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 /* Map 32-bit characters to '\Uxxxxxxxx' */
6030 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006031 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006032 *p++ = '\\';
6033 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006034 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6035 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6036 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6037 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6038 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6039 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6040 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6041 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006042 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006044 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 *p++ = '\\';
6046 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006047 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6048 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6049 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6050 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 /* Copy everything else as-is */
6053 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 *p++ = (char) ch;
6055 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006056
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006057 assert(p > q);
6058 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006059 return NULL;
6060 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061}
6062
Alexander Belopolsky40018472011-02-26 01:02:56 +00006063PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006064PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6065 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006067 PyObject *result;
6068 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6069 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006070 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006071 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6072 Py_DECREF(tmp);
6073 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074}
6075
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006076/* --- Unicode Internal Codec ------------------------------------------- */
6077
Alexander Belopolsky40018472011-02-26 01:02:56 +00006078PyObject *
6079_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006080 Py_ssize_t size,
6081 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006082{
6083 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006084 Py_ssize_t startinpos;
6085 Py_ssize_t endinpos;
6086 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006087 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006088 const char *end;
6089 const char *reason;
6090 PyObject *errorHandler = NULL;
6091 PyObject *exc = NULL;
6092
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006093 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006094 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006095 1))
6096 return NULL;
6097
Thomas Wouters89f507f2006-12-13 04:49:30 +00006098 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006099 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006100 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006102 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006103 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006104 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006105 end = s + size;
6106
6107 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006108 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006109 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006110 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006111 endinpos = end-starts;
6112 reason = "truncated input";
6113 goto error;
6114 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006115 /* We copy the raw representation one byte at a time because the
6116 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006117 ((char *) &uch)[0] = s[0];
6118 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006119#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006120 ((char *) &uch)[2] = s[2];
6121 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006122#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006123 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006124#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006125 /* We have to sanity check the raw data, otherwise doom looms for
6126 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006127 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006128 endinpos = s - starts + Py_UNICODE_SIZE;
6129 reason = "illegal code point (> 0x10FFFF)";
6130 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006131 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006132#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006133 s += Py_UNICODE_SIZE;
6134#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006135 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006136 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006137 Py_UNICODE uch2;
6138 ((char *) &uch2)[0] = s[0];
6139 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006140 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006141 {
Victor Stinner551ac952011-11-29 22:58:13 +01006142 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006143 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006144 }
6145 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006146#endif
6147
6148 if (unicode_putchar(&v, &outpos, ch) < 0)
6149 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006150 continue;
6151
6152 error:
6153 startinpos = s - starts;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006154 if (unicode_decode_call_errorhandler(
6155 errors, &errorHandler,
6156 "unicode_internal", reason,
6157 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006158 &v, &outpos))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006159 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006160 }
6161
Victor Stinner16e6a802011-12-12 13:24:15 +01006162 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006163 goto onError;
6164 Py_XDECREF(errorHandler);
6165 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006166 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006167
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006169 Py_XDECREF(v);
6170 Py_XDECREF(errorHandler);
6171 Py_XDECREF(exc);
6172 return NULL;
6173}
6174
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175/* --- Latin-1 Codec ------------------------------------------------------ */
6176
Alexander Belopolsky40018472011-02-26 01:02:56 +00006177PyObject *
6178PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006179 Py_ssize_t size,
6180 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006183 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184}
6185
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006186/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006187static void
6188make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006189 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006190 PyObject *unicode,
6191 Py_ssize_t startpos, Py_ssize_t endpos,
6192 const char *reason)
6193{
6194 if (*exceptionObject == NULL) {
6195 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006196 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006197 encoding, unicode, startpos, endpos, reason);
6198 }
6199 else {
6200 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6201 goto onError;
6202 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6203 goto onError;
6204 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6205 goto onError;
6206 return;
6207 onError:
6208 Py_DECREF(*exceptionObject);
6209 *exceptionObject = NULL;
6210 }
6211}
6212
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006213/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006214static void
6215raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006216 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006217 PyObject *unicode,
6218 Py_ssize_t startpos, Py_ssize_t endpos,
6219 const char *reason)
6220{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006221 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006222 encoding, unicode, startpos, endpos, reason);
6223 if (*exceptionObject != NULL)
6224 PyCodec_StrictErrors(*exceptionObject);
6225}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006226
6227/* error handling callback helper:
6228 build arguments, call the callback and check the arguments,
6229 put the result into newpos and return the replacement string, which
6230 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006231static PyObject *
6232unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006233 PyObject **errorHandler,
6234 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006235 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006236 Py_ssize_t startpos, Py_ssize_t endpos,
6237 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006238{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006239 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006240 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006241 PyObject *restuple;
6242 PyObject *resunicode;
6243
6244 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006246 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006248 }
6249
Benjamin Petersonbac79492012-01-14 13:34:47 -05006250 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006251 return NULL;
6252 len = PyUnicode_GET_LENGTH(unicode);
6253
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006254 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006255 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006256 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006258
6259 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006261 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006263 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006264 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006265 Py_DECREF(restuple);
6266 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006267 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006268 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 &resunicode, newpos)) {
6270 Py_DECREF(restuple);
6271 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006273 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6274 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6275 Py_DECREF(restuple);
6276 return NULL;
6277 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006278 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006279 *newpos = len + *newpos;
6280 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6282 Py_DECREF(restuple);
6283 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006284 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006285 Py_INCREF(resunicode);
6286 Py_DECREF(restuple);
6287 return resunicode;
6288}
6289
Alexander Belopolsky40018472011-02-26 01:02:56 +00006290static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006291unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006292 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006293 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006294{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006295 /* input state */
6296 Py_ssize_t pos=0, size;
6297 int kind;
6298 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006299 /* output object */
6300 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301 /* pointer into the output */
6302 char *str;
6303 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006304 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006305 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6306 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 PyObject *errorHandler = NULL;
6308 PyObject *exc = NULL;
6309 /* the following variable is used for caching string comparisons
6310 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6311 int known_errorHandler = -1;
6312
Benjamin Petersonbac79492012-01-14 13:34:47 -05006313 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006314 return NULL;
6315 size = PyUnicode_GET_LENGTH(unicode);
6316 kind = PyUnicode_KIND(unicode);
6317 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006318 /* allocate enough for a simple encoding without
6319 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006320 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006321 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006322 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006323 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006324 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006325 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006326 ressize = size;
6327
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006328 while (pos < size) {
6329 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006330
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 /* can we encode this? */
6332 if (c<limit) {
6333 /* no overflow check, because we know that the space is enough */
6334 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006335 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006336 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 Py_ssize_t requiredsize;
6339 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006340 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006342 Py_ssize_t collstart = pos;
6343 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006345 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 ++collend;
6347 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6348 if (known_errorHandler==-1) {
6349 if ((errors==NULL) || (!strcmp(errors, "strict")))
6350 known_errorHandler = 1;
6351 else if (!strcmp(errors, "replace"))
6352 known_errorHandler = 2;
6353 else if (!strcmp(errors, "ignore"))
6354 known_errorHandler = 3;
6355 else if (!strcmp(errors, "xmlcharrefreplace"))
6356 known_errorHandler = 4;
6357 else
6358 known_errorHandler = 0;
6359 }
6360 switch (known_errorHandler) {
6361 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006362 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 goto onError;
6364 case 2: /* replace */
6365 while (collstart++<collend)
6366 *str++ = '?'; /* fall through */
6367 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006368 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 break;
6370 case 4: /* xmlcharrefreplace */
6371 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006372 /* determine replacement size */
6373 for (i = collstart, repsize = 0; i < collend; ++i) {
6374 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6375 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006377 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006379 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006381 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006383 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006385 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006387 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006388 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006390 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006392 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 if (requiredsize > ressize) {
6394 if (requiredsize<2*ressize)
6395 requiredsize = 2*ressize;
6396 if (_PyBytes_Resize(&res, requiredsize))
6397 goto onError;
6398 str = PyBytes_AS_STRING(res) + respos;
6399 ressize = requiredsize;
6400 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006401 /* generate replacement */
6402 for (i = collstart; i < collend; ++i) {
6403 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006405 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 break;
6407 default:
6408 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 encoding, reason, unicode, &exc,
6410 collstart, collend, &newpos);
6411 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006412 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006414 if (PyBytes_Check(repunicode)) {
6415 /* Directly copy bytes result to output. */
6416 repsize = PyBytes_Size(repunicode);
6417 if (repsize > 1) {
6418 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006419 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006420 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6421 Py_DECREF(repunicode);
6422 goto onError;
6423 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006424 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006425 ressize += repsize-1;
6426 }
6427 memcpy(str, PyBytes_AsString(repunicode), repsize);
6428 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006429 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006430 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006431 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006432 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 /* need more space? (at least enough for what we
6434 have+the replacement+the rest of the string, so
6435 we won't have to check space for encodable characters) */
6436 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006437 repsize = PyUnicode_GET_LENGTH(repunicode);
6438 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 if (requiredsize > ressize) {
6440 if (requiredsize<2*ressize)
6441 requiredsize = 2*ressize;
6442 if (_PyBytes_Resize(&res, requiredsize)) {
6443 Py_DECREF(repunicode);
6444 goto onError;
6445 }
6446 str = PyBytes_AS_STRING(res) + respos;
6447 ressize = requiredsize;
6448 }
6449 /* check if there is anything unencodable in the replacement
6450 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006451 for (i = 0; repsize-->0; ++i, ++str) {
6452 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006454 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006455 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 Py_DECREF(repunicode);
6457 goto onError;
6458 }
6459 *str = (char)c;
6460 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006461 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006462 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006464 }
6465 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006466 /* Resize if we allocated to much */
6467 size = str - PyBytes_AS_STRING(res);
6468 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006469 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006470 if (_PyBytes_Resize(&res, size) < 0)
6471 goto onError;
6472 }
6473
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006474 Py_XDECREF(errorHandler);
6475 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006476 return res;
6477
6478 onError:
6479 Py_XDECREF(res);
6480 Py_XDECREF(errorHandler);
6481 Py_XDECREF(exc);
6482 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483}
6484
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006485/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006486PyObject *
6487PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006488 Py_ssize_t size,
6489 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 PyObject *result;
6492 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6493 if (unicode == NULL)
6494 return NULL;
6495 result = unicode_encode_ucs1(unicode, errors, 256);
6496 Py_DECREF(unicode);
6497 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498}
6499
Alexander Belopolsky40018472011-02-26 01:02:56 +00006500PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006501_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502{
6503 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 PyErr_BadArgument();
6505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006507 if (PyUnicode_READY(unicode) == -1)
6508 return NULL;
6509 /* Fast path: if it is a one-byte string, construct
6510 bytes object directly. */
6511 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6512 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6513 PyUnicode_GET_LENGTH(unicode));
6514 /* Non-Latin-1 characters present. Defer to above function to
6515 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006517}
6518
6519PyObject*
6520PyUnicode_AsLatin1String(PyObject *unicode)
6521{
6522 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523}
6524
6525/* --- 7-bit ASCII Codec -------------------------------------------------- */
6526
Alexander Belopolsky40018472011-02-26 01:02:56 +00006527PyObject *
6528PyUnicode_DecodeASCII(const char *s,
6529 Py_ssize_t size,
6530 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006532 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006533 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006534 int kind;
6535 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006536 Py_ssize_t startinpos;
6537 Py_ssize_t endinpos;
6538 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006539 const char *e;
6540 PyObject *errorHandler = NULL;
6541 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006542
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006544 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006545
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006547 if (size == 1 && (unsigned char)s[0] < 128)
6548 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006549
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006550 unicode = PyUnicode_New(size, 127);
6551 if (unicode == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006554 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006555 data = PyUnicode_1BYTE_DATA(unicode);
6556 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6557 if (outpos == size)
6558 return unicode;
6559
6560 s += outpos;
6561 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 register unsigned char c = (unsigned char)*s;
6564 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006565 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 ++s;
6567 }
6568 else {
6569 startinpos = s-starts;
6570 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 if (unicode_decode_call_errorhandler(
6572 errors, &errorHandler,
6573 "ascii", "ordinal not in range(128)",
6574 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006575 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006577 kind = PyUnicode_KIND(unicode);
6578 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006581 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006582 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006583 Py_XDECREF(errorHandler);
6584 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006585 assert(_PyUnicode_CheckConsistency(unicode, 1));
6586 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006587
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006589 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006590 Py_XDECREF(errorHandler);
6591 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 return NULL;
6593}
6594
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006595/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006596PyObject *
6597PyUnicode_EncodeASCII(const Py_UNICODE *p,
6598 Py_ssize_t size,
6599 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006601 PyObject *result;
6602 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6603 if (unicode == NULL)
6604 return NULL;
6605 result = unicode_encode_ucs1(unicode, errors, 128);
6606 Py_DECREF(unicode);
6607 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608}
6609
Alexander Belopolsky40018472011-02-26 01:02:56 +00006610PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006611_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612{
6613 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 PyErr_BadArgument();
6615 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006617 if (PyUnicode_READY(unicode) == -1)
6618 return NULL;
6619 /* Fast path: if it is an ASCII-only string, construct bytes object
6620 directly. Else defer to above function to raise the exception. */
6621 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6622 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6623 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006625}
6626
6627PyObject *
6628PyUnicode_AsASCIIString(PyObject *unicode)
6629{
6630 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631}
6632
Victor Stinner99b95382011-07-04 14:23:54 +02006633#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006634
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006635/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006636
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006637#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006638#define NEED_RETRY
6639#endif
6640
Victor Stinner3a50e702011-10-18 21:21:00 +02006641#ifndef WC_ERR_INVALID_CHARS
6642# define WC_ERR_INVALID_CHARS 0x0080
6643#endif
6644
6645static char*
6646code_page_name(UINT code_page, PyObject **obj)
6647{
6648 *obj = NULL;
6649 if (code_page == CP_ACP)
6650 return "mbcs";
6651 if (code_page == CP_UTF7)
6652 return "CP_UTF7";
6653 if (code_page == CP_UTF8)
6654 return "CP_UTF8";
6655
6656 *obj = PyBytes_FromFormat("cp%u", code_page);
6657 if (*obj == NULL)
6658 return NULL;
6659 return PyBytes_AS_STRING(*obj);
6660}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006661
Alexander Belopolsky40018472011-02-26 01:02:56 +00006662static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006663is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006664{
6665 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006666 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006667
Victor Stinner3a50e702011-10-18 21:21:00 +02006668 if (!IsDBCSLeadByteEx(code_page, *curr))
6669 return 0;
6670
6671 prev = CharPrevExA(code_page, s, curr, 0);
6672 if (prev == curr)
6673 return 1;
6674 /* FIXME: This code is limited to "true" double-byte encodings,
6675 as it assumes an incomplete character consists of a single
6676 byte. */
6677 if (curr - prev == 2)
6678 return 1;
6679 if (!IsDBCSLeadByteEx(code_page, *prev))
6680 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006681 return 0;
6682}
6683
Victor Stinner3a50e702011-10-18 21:21:00 +02006684static DWORD
6685decode_code_page_flags(UINT code_page)
6686{
6687 if (code_page == CP_UTF7) {
6688 /* The CP_UTF7 decoder only supports flags=0 */
6689 return 0;
6690 }
6691 else
6692 return MB_ERR_INVALID_CHARS;
6693}
6694
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006695/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006696 * Decode a byte string from a Windows code page into unicode object in strict
6697 * mode.
6698 *
6699 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6700 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006701 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006702static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006703decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006704 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006705 const char *in,
6706 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006707{
Victor Stinner3a50e702011-10-18 21:21:00 +02006708 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006709 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006710 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006711
6712 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006713 assert(insize > 0);
6714 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6715 if (outsize <= 0)
6716 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006717
6718 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006720 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006721 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 if (*v == NULL)
6723 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006724 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006725 }
6726 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006728 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006729 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006731 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006732 }
6733
6734 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006735 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6736 if (outsize <= 0)
6737 goto error;
6738 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006739
Victor Stinner3a50e702011-10-18 21:21:00 +02006740error:
6741 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6742 return -2;
6743 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006744 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006745}
6746
Victor Stinner3a50e702011-10-18 21:21:00 +02006747/*
6748 * Decode a byte string from a code page into unicode object with an error
6749 * handler.
6750 *
6751 * Returns consumed size if succeed, or raise a WindowsError or
6752 * UnicodeDecodeError exception and returns -1 on error.
6753 */
6754static int
6755decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006756 PyObject **v,
6757 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006758 const char *errors)
6759{
6760 const char *startin = in;
6761 const char *endin = in + size;
6762 const DWORD flags = decode_code_page_flags(code_page);
6763 /* Ideally, we should get reason from FormatMessage. This is the Windows
6764 2000 English version of the message. */
6765 const char *reason = "No mapping for the Unicode character exists "
6766 "in the target code page.";
6767 /* each step cannot decode more than 1 character, but a character can be
6768 represented as a surrogate pair */
6769 wchar_t buffer[2], *startout, *out;
6770 int insize, outsize;
6771 PyObject *errorHandler = NULL;
6772 PyObject *exc = NULL;
6773 PyObject *encoding_obj = NULL;
6774 char *encoding;
6775 DWORD err;
6776 int ret = -1;
6777
6778 assert(size > 0);
6779
6780 encoding = code_page_name(code_page, &encoding_obj);
6781 if (encoding == NULL)
6782 return -1;
6783
6784 if (errors == NULL || strcmp(errors, "strict") == 0) {
6785 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6786 UnicodeDecodeError. */
6787 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6788 if (exc != NULL) {
6789 PyCodec_StrictErrors(exc);
6790 Py_CLEAR(exc);
6791 }
6792 goto error;
6793 }
6794
6795 if (*v == NULL) {
6796 /* Create unicode object */
6797 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6798 PyErr_NoMemory();
6799 goto error;
6800 }
Victor Stinnerab595942011-12-17 04:59:06 +01006801 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006802 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006803 if (*v == NULL)
6804 goto error;
6805 startout = PyUnicode_AS_UNICODE(*v);
6806 }
6807 else {
6808 /* Extend unicode object */
6809 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6810 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6811 PyErr_NoMemory();
6812 goto error;
6813 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006814 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006815 goto error;
6816 startout = PyUnicode_AS_UNICODE(*v) + n;
6817 }
6818
6819 /* Decode the byte string character per character */
6820 out = startout;
6821 while (in < endin)
6822 {
6823 /* Decode a character */
6824 insize = 1;
6825 do
6826 {
6827 outsize = MultiByteToWideChar(code_page, flags,
6828 in, insize,
6829 buffer, Py_ARRAY_LENGTH(buffer));
6830 if (outsize > 0)
6831 break;
6832 err = GetLastError();
6833 if (err != ERROR_NO_UNICODE_TRANSLATION
6834 && err != ERROR_INSUFFICIENT_BUFFER)
6835 {
6836 PyErr_SetFromWindowsErr(0);
6837 goto error;
6838 }
6839 insize++;
6840 }
6841 /* 4=maximum length of a UTF-8 sequence */
6842 while (insize <= 4 && (in + insize) <= endin);
6843
6844 if (outsize <= 0) {
6845 Py_ssize_t startinpos, endinpos, outpos;
6846
6847 startinpos = in - startin;
6848 endinpos = startinpos + 1;
6849 outpos = out - PyUnicode_AS_UNICODE(*v);
6850 if (unicode_decode_call_errorhandler(
6851 errors, &errorHandler,
6852 encoding, reason,
6853 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006854 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006855 {
6856 goto error;
6857 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006858 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006859 }
6860 else {
6861 in += insize;
6862 memcpy(out, buffer, outsize * sizeof(wchar_t));
6863 out += outsize;
6864 }
6865 }
6866
6867 /* write a NUL character at the end */
6868 *out = 0;
6869
6870 /* Extend unicode object */
6871 outsize = out - startout;
6872 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006873 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006874 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006875 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006876
6877error:
6878 Py_XDECREF(encoding_obj);
6879 Py_XDECREF(errorHandler);
6880 Py_XDECREF(exc);
6881 return ret;
6882}
6883
Victor Stinner3a50e702011-10-18 21:21:00 +02006884static PyObject *
6885decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006886 const char *s, Py_ssize_t size,
6887 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006888{
Victor Stinner76a31a62011-11-04 00:05:13 +01006889 PyObject *v = NULL;
6890 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006891
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 if (code_page < 0) {
6893 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6894 return NULL;
6895 }
6896
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006897 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006899
Victor Stinner76a31a62011-11-04 00:05:13 +01006900 do
6901 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006903 if (size > INT_MAX) {
6904 chunk_size = INT_MAX;
6905 final = 0;
6906 done = 0;
6907 }
6908 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006909#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006910 {
6911 chunk_size = (int)size;
6912 final = (consumed == NULL);
6913 done = 1;
6914 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915
Victor Stinner76a31a62011-11-04 00:05:13 +01006916 /* Skip trailing lead-byte unless 'final' is set */
6917 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6918 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006919
Victor Stinner76a31a62011-11-04 00:05:13 +01006920 if (chunk_size == 0 && done) {
6921 if (v != NULL)
6922 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006923 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006924 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006925
Victor Stinner76a31a62011-11-04 00:05:13 +01006926
6927 converted = decode_code_page_strict(code_page, &v,
6928 s, chunk_size);
6929 if (converted == -2)
6930 converted = decode_code_page_errors(code_page, &v,
6931 s, chunk_size,
6932 errors);
6933 assert(converted != 0);
6934
6935 if (converted < 0) {
6936 Py_XDECREF(v);
6937 return NULL;
6938 }
6939
6940 if (consumed)
6941 *consumed += converted;
6942
6943 s += converted;
6944 size -= converted;
6945 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006946
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006947 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006948}
6949
Alexander Belopolsky40018472011-02-26 01:02:56 +00006950PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006951PyUnicode_DecodeCodePageStateful(int code_page,
6952 const char *s,
6953 Py_ssize_t size,
6954 const char *errors,
6955 Py_ssize_t *consumed)
6956{
6957 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6958}
6959
6960PyObject *
6961PyUnicode_DecodeMBCSStateful(const char *s,
6962 Py_ssize_t size,
6963 const char *errors,
6964 Py_ssize_t *consumed)
6965{
6966 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6967}
6968
6969PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006970PyUnicode_DecodeMBCS(const char *s,
6971 Py_ssize_t size,
6972 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006973{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6975}
6976
Victor Stinner3a50e702011-10-18 21:21:00 +02006977static DWORD
6978encode_code_page_flags(UINT code_page, const char *errors)
6979{
6980 if (code_page == CP_UTF8) {
6981 if (winver.dwMajorVersion >= 6)
6982 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6983 and later */
6984 return WC_ERR_INVALID_CHARS;
6985 else
6986 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6987 return 0;
6988 }
6989 else if (code_page == CP_UTF7) {
6990 /* CP_UTF7 only supports flags=0 */
6991 return 0;
6992 }
6993 else {
6994 if (errors != NULL && strcmp(errors, "replace") == 0)
6995 return 0;
6996 else
6997 return WC_NO_BEST_FIT_CHARS;
6998 }
6999}
7000
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007001/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007002 * Encode a Unicode string to a Windows code page into a byte string in strict
7003 * mode.
7004 *
7005 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7006 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007007 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007008static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007009encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007010 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007011 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007012{
Victor Stinner554f3f02010-06-16 23:33:54 +00007013 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007014 BOOL *pusedDefaultChar = &usedDefaultChar;
7015 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007016 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007017 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007018 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007019 const DWORD flags = encode_code_page_flags(code_page, NULL);
7020 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007021 /* Create a substring so that we can get the UTF-16 representation
7022 of just the slice under consideration. */
7023 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007024
Martin v. Löwis3d325192011-11-04 18:23:06 +01007025 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007026
Victor Stinner3a50e702011-10-18 21:21:00 +02007027 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007028 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007029 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007030 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007031
Victor Stinner2fc507f2011-11-04 20:06:39 +01007032 substring = PyUnicode_Substring(unicode, offset, offset+len);
7033 if (substring == NULL)
7034 return -1;
7035 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7036 if (p == NULL) {
7037 Py_DECREF(substring);
7038 return -1;
7039 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007040
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007041 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007042 outsize = WideCharToMultiByte(code_page, flags,
7043 p, size,
7044 NULL, 0,
7045 NULL, pusedDefaultChar);
7046 if (outsize <= 0)
7047 goto error;
7048 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007049 if (pusedDefaultChar && *pusedDefaultChar) {
7050 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007051 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007052 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007053
Victor Stinner3a50e702011-10-18 21:21:00 +02007054 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007056 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007057 if (*outbytes == NULL) {
7058 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007060 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007061 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062 }
7063 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 const Py_ssize_t n = PyBytes_Size(*outbytes);
7066 if (outsize > PY_SSIZE_T_MAX - n) {
7067 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007068 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007070 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007071 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7072 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007073 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007074 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076 }
7077
7078 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007079 outsize = WideCharToMultiByte(code_page, flags,
7080 p, size,
7081 out, outsize,
7082 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007083 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007084 if (outsize <= 0)
7085 goto error;
7086 if (pusedDefaultChar && *pusedDefaultChar)
7087 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007089
Victor Stinner3a50e702011-10-18 21:21:00 +02007090error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007091 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7093 return -2;
7094 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007095 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007096}
7097
Victor Stinner3a50e702011-10-18 21:21:00 +02007098/*
7099 * Encode a Unicode string to a Windows code page into a byte string using a
7100 * error handler.
7101 *
7102 * Returns consumed characters if succeed, or raise a WindowsError and returns
7103 * -1 on other error.
7104 */
7105static int
7106encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007107 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007108 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007109{
Victor Stinner3a50e702011-10-18 21:21:00 +02007110 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007111 Py_ssize_t pos = unicode_offset;
7112 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007113 /* Ideally, we should get reason from FormatMessage. This is the Windows
7114 2000 English version of the message. */
7115 const char *reason = "invalid character";
7116 /* 4=maximum length of a UTF-8 sequence */
7117 char buffer[4];
7118 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7119 Py_ssize_t outsize;
7120 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 PyObject *errorHandler = NULL;
7122 PyObject *exc = NULL;
7123 PyObject *encoding_obj = NULL;
7124 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007125 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 PyObject *rep;
7127 int ret = -1;
7128
7129 assert(insize > 0);
7130
7131 encoding = code_page_name(code_page, &encoding_obj);
7132 if (encoding == NULL)
7133 return -1;
7134
7135 if (errors == NULL || strcmp(errors, "strict") == 0) {
7136 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7137 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007138 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 if (exc != NULL) {
7140 PyCodec_StrictErrors(exc);
7141 Py_DECREF(exc);
7142 }
7143 Py_XDECREF(encoding_obj);
7144 return -1;
7145 }
7146
7147 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7148 pusedDefaultChar = &usedDefaultChar;
7149 else
7150 pusedDefaultChar = NULL;
7151
7152 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7153 PyErr_NoMemory();
7154 goto error;
7155 }
7156 outsize = insize * Py_ARRAY_LENGTH(buffer);
7157
7158 if (*outbytes == NULL) {
7159 /* Create string object */
7160 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7161 if (*outbytes == NULL)
7162 goto error;
7163 out = PyBytes_AS_STRING(*outbytes);
7164 }
7165 else {
7166 /* Extend string object */
7167 Py_ssize_t n = PyBytes_Size(*outbytes);
7168 if (n > PY_SSIZE_T_MAX - outsize) {
7169 PyErr_NoMemory();
7170 goto error;
7171 }
7172 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7173 goto error;
7174 out = PyBytes_AS_STRING(*outbytes) + n;
7175 }
7176
7177 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007178 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007180 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7181 wchar_t chars[2];
7182 int charsize;
7183 if (ch < 0x10000) {
7184 chars[0] = (wchar_t)ch;
7185 charsize = 1;
7186 }
7187 else {
7188 ch -= 0x10000;
7189 chars[0] = 0xd800 + (ch >> 10);
7190 chars[1] = 0xdc00 + (ch & 0x3ff);
7191 charsize = 2;
7192 }
7193
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007195 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007196 buffer, Py_ARRAY_LENGTH(buffer),
7197 NULL, pusedDefaultChar);
7198 if (outsize > 0) {
7199 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7200 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007201 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 memcpy(out, buffer, outsize);
7203 out += outsize;
7204 continue;
7205 }
7206 }
7207 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7208 PyErr_SetFromWindowsErr(0);
7209 goto error;
7210 }
7211
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 rep = unicode_encode_call_errorhandler(
7213 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007214 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007215 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 if (rep == NULL)
7217 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007218 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007219
7220 if (PyBytes_Check(rep)) {
7221 outsize = PyBytes_GET_SIZE(rep);
7222 if (outsize != 1) {
7223 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7224 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7225 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7226 Py_DECREF(rep);
7227 goto error;
7228 }
7229 out = PyBytes_AS_STRING(*outbytes) + offset;
7230 }
7231 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7232 out += outsize;
7233 }
7234 else {
7235 Py_ssize_t i;
7236 enum PyUnicode_Kind kind;
7237 void *data;
7238
Benjamin Petersonbac79492012-01-14 13:34:47 -05007239 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 Py_DECREF(rep);
7241 goto error;
7242 }
7243
7244 outsize = PyUnicode_GET_LENGTH(rep);
7245 if (outsize != 1) {
7246 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7247 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7248 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7249 Py_DECREF(rep);
7250 goto error;
7251 }
7252 out = PyBytes_AS_STRING(*outbytes) + offset;
7253 }
7254 kind = PyUnicode_KIND(rep);
7255 data = PyUnicode_DATA(rep);
7256 for (i=0; i < outsize; i++) {
7257 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7258 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007259 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007260 encoding, unicode,
7261 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 "unable to encode error handler result to ASCII");
7263 Py_DECREF(rep);
7264 goto error;
7265 }
7266 *out = (unsigned char)ch;
7267 out++;
7268 }
7269 }
7270 Py_DECREF(rep);
7271 }
7272 /* write a NUL byte */
7273 *out = 0;
7274 outsize = out - PyBytes_AS_STRING(*outbytes);
7275 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7276 if (_PyBytes_Resize(outbytes, outsize) < 0)
7277 goto error;
7278 ret = 0;
7279
7280error:
7281 Py_XDECREF(encoding_obj);
7282 Py_XDECREF(errorHandler);
7283 Py_XDECREF(exc);
7284 return ret;
7285}
7286
Victor Stinner3a50e702011-10-18 21:21:00 +02007287static PyObject *
7288encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007289 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007290 const char *errors)
7291{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007292 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007293 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007294 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007295 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007296
Benjamin Petersonbac79492012-01-14 13:34:47 -05007297 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007298 return NULL;
7299 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007300
Victor Stinner3a50e702011-10-18 21:21:00 +02007301 if (code_page < 0) {
7302 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7303 return NULL;
7304 }
7305
Martin v. Löwis3d325192011-11-04 18:23:06 +01007306 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007307 return PyBytes_FromStringAndSize(NULL, 0);
7308
Victor Stinner7581cef2011-11-03 22:32:33 +01007309 offset = 0;
7310 do
7311 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007312#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007313 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007314 chunks. */
7315 if (len > INT_MAX/2) {
7316 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007317 done = 0;
7318 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007319 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007322 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007323 done = 1;
7324 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007325
Victor Stinner76a31a62011-11-04 00:05:13 +01007326 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007327 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007328 errors);
7329 if (ret == -2)
7330 ret = encode_code_page_errors(code_page, &outbytes,
7331 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007332 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007333 if (ret < 0) {
7334 Py_XDECREF(outbytes);
7335 return NULL;
7336 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
Victor Stinner7581cef2011-11-03 22:32:33 +01007338 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007339 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007340 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007341
Victor Stinner3a50e702011-10-18 21:21:00 +02007342 return outbytes;
7343}
7344
7345PyObject *
7346PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7347 Py_ssize_t size,
7348 const char *errors)
7349{
Victor Stinner7581cef2011-11-03 22:32:33 +01007350 PyObject *unicode, *res;
7351 unicode = PyUnicode_FromUnicode(p, size);
7352 if (unicode == NULL)
7353 return NULL;
7354 res = encode_code_page(CP_ACP, unicode, errors);
7355 Py_DECREF(unicode);
7356 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007357}
7358
7359PyObject *
7360PyUnicode_EncodeCodePage(int code_page,
7361 PyObject *unicode,
7362 const char *errors)
7363{
Victor Stinner7581cef2011-11-03 22:32:33 +01007364 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007365}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007366
Alexander Belopolsky40018472011-02-26 01:02:56 +00007367PyObject *
7368PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007369{
7370 if (!PyUnicode_Check(unicode)) {
7371 PyErr_BadArgument();
7372 return NULL;
7373 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007374 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007375}
7376
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007377#undef NEED_RETRY
7378
Victor Stinner99b95382011-07-04 14:23:54 +02007379#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007380
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381/* --- Character Mapping Codec -------------------------------------------- */
7382
Alexander Belopolsky40018472011-02-26 01:02:56 +00007383PyObject *
7384PyUnicode_DecodeCharmap(const char *s,
7385 Py_ssize_t size,
7386 PyObject *mapping,
7387 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007389 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007390 Py_ssize_t startinpos;
7391 Py_ssize_t endinpos;
7392 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007393 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007394 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007395 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007396 PyObject *errorHandler = NULL;
7397 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007398
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399 /* Default to Latin-1 */
7400 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007403 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007407 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007408 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007409 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007410 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007411 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007412 enum PyUnicode_Kind mapkind;
7413 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007414 Py_UCS4 x;
7415
Benjamin Petersonbac79492012-01-14 13:34:47 -05007416 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007417 return NULL;
7418
7419 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007420 mapdata = PyUnicode_DATA(mapping);
7421 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007423 unsigned char ch;
7424 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7425 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7426 if (outkind == PyUnicode_1BYTE_KIND) {
7427 void *outdata = PyUnicode_DATA(v);
7428 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7429 while (s < e) {
7430 unsigned char ch = *s;
7431 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7432 if (x > maxchar)
7433 goto Error;
7434 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7435 ++s;
7436 }
7437 break;
7438 }
7439 else if (outkind == PyUnicode_2BYTE_KIND) {
7440 void *outdata = PyUnicode_DATA(v);
7441 while (s < e) {
7442 unsigned char ch = *s;
7443 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7444 if (x == 0xFFFE)
7445 goto Error;
7446 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7447 ++s;
7448 }
7449 break;
7450 }
7451 }
7452 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007455 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007456 else
7457 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007458Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007459 if (x == 0xfffe)
7460 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 startinpos = s-starts;
7463 endinpos = startinpos+1;
7464 if (unicode_decode_call_errorhandler(
7465 errors, &errorHandler,
7466 "charmap", "character maps to <undefined>",
7467 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007468 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 goto onError;
7470 }
7471 continue;
7472 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007473
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007474 if (unicode_putchar(&v, &outpos, x) < 0)
7475 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007477 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007478 }
7479 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 while (s < e) {
7481 unsigned char ch = *s;
7482 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007483
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7485 w = PyLong_FromLong((long)ch);
7486 if (w == NULL)
7487 goto onError;
7488 x = PyObject_GetItem(mapping, w);
7489 Py_DECREF(w);
7490 if (x == NULL) {
7491 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7492 /* No mapping found means: mapping is undefined. */
7493 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007494 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 } else
7496 goto onError;
7497 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007498
Benjamin Peterson29060642009-01-31 22:14:21 +00007499 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007500 if (x == Py_None)
7501 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 if (PyLong_Check(x)) {
7503 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007504 if (value == 0xFFFE)
7505 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007506 if (value < 0 || value > MAX_UNICODE) {
7507 PyErr_Format(PyExc_TypeError,
7508 "character mapping must be in range(0x%lx)",
7509 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 Py_DECREF(x);
7511 goto onError;
7512 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007513 if (unicode_putchar(&v, &outpos, value) < 0) {
7514 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007515 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007516 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007519 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007520
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007521 if (PyUnicode_READY(x) == -1) {
7522 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007523 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007524 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007525 targetsize = PyUnicode_GET_LENGTH(x);
7526
7527 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 /* 1-1 mapping */
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007529 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007530 if (value == 0xFFFE)
7531 goto Undefined;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007532 if (unicode_putchar(&v, &outpos, value) < 0) {
7533 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007534 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007535 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007536 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 else if (targetsize > 1) {
7538 /* 1-n mapping */
7539 if (targetsize > extrachars) {
7540 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 Py_ssize_t needed = (targetsize - extrachars) + \
7542 (targetsize << 2);
7543 extrachars += needed;
7544 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007545 if (unicode_resize(&v,
7546 PyUnicode_GET_LENGTH(v) + needed) < 0)
7547 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007548 Py_DECREF(x);
7549 goto onError;
7550 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007552 if (unicode_widen(&v, outpos,
7553 PyUnicode_MAX_CHAR_VALUE(x)) < 0) {
7554 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007555 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007556 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007557 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7558 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 extrachars -= targetsize;
7560 }
7561 /* 1-0 mapping: skip the character */
7562 }
7563 else {
7564 /* wrong return value */
7565 PyErr_SetString(PyExc_TypeError,
7566 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007567 Py_DECREF(x);
7568 goto onError;
7569 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 Py_DECREF(x);
7571 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007572 continue;
7573Undefined:
7574 /* undefined mapping */
7575 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007576 startinpos = s-starts;
7577 endinpos = startinpos+1;
7578 if (unicode_decode_call_errorhandler(
7579 errors, &errorHandler,
7580 "charmap", "character maps to <undefined>",
7581 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007582 &v, &outpos)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007583 goto onError;
7584 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007587 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007588 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007589 Py_XDECREF(errorHandler);
7590 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007591 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007592
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007594 Py_XDECREF(errorHandler);
7595 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 Py_XDECREF(v);
7597 return NULL;
7598}
7599
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007600/* Charmap encoding: the lookup table */
7601
/* Three-level lookup trie produced by PyUnicode_BuildEncodingMap() and
   queried by encoding_map_lookup().  The object is allocated with extra
   room behind level23[], so the declared length of 1 is only a
   placeholder. */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by (c >> 11); 0xFF marks an unused entry. */
    unsigned char level1[32];
    /* Number of 16-byte level-2 tables and 128-byte level-3 tables. */
    int count2, count3;
    /* 16*count2 bytes of level-2 tables followed by 128*count3 bytes of
       level-3 tables. */
    unsigned char level23[1];
};
7608
7609static PyObject*
7610encoding_map_size(PyObject *obj, PyObject* args)
7611{
7612 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007613 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007615}
7616
/* Method table of the EncodingMap type: a single argument-less size()
   method, followed by the zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
7622
/* tp_dealloc of the EncodingMap type: hand the object's memory back with
   PyObject_FREE(). */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
7628
/* Type object of the encoding trie.  Only tp_dealloc, tp_flags and
   tp_methods are filled in; tp_new is left empty, and instances are
   created in PyUnicode_BuildEncodingMap() with PyObject_MALLOC() and
   PyObject_Init(). */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7672
7673PyObject*
7674PyUnicode_BuildEncodingMap(PyObject* string)
7675{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007676 PyObject *result;
7677 struct encoding_map *mresult;
7678 int i;
7679 int need_dict = 0;
7680 unsigned char level1[32];
7681 unsigned char level2[512];
7682 unsigned char *mlevel1, *mlevel2, *mlevel3;
7683 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007684 int kind;
7685 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007686 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007687 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007688
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007689 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007690 PyErr_BadArgument();
7691 return NULL;
7692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007693 kind = PyUnicode_KIND(string);
7694 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007695 length = PyUnicode_GET_LENGTH(string);
7696 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007697 memset(level1, 0xFF, sizeof level1);
7698 memset(level2, 0xFF, sizeof level2);
7699
7700 /* If there isn't a one-to-one mapping of NULL to \0,
7701 or if there are non-BMP characters, we need to use
7702 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007703 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007704 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007705 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007706 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007707 ch = PyUnicode_READ(kind, data, i);
7708 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007709 need_dict = 1;
7710 break;
7711 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007712 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007713 /* unmapped character */
7714 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007715 l1 = ch >> 11;
7716 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007717 if (level1[l1] == 0xFF)
7718 level1[l1] = count2++;
7719 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007720 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007721 }
7722
7723 if (count2 >= 0xFF || count3 >= 0xFF)
7724 need_dict = 1;
7725
7726 if (need_dict) {
7727 PyObject *result = PyDict_New();
7728 PyObject *key, *value;
7729 if (!result)
7730 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007731 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007732 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007733 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007734 if (!key || !value)
7735 goto failed1;
7736 if (PyDict_SetItem(result, key, value) == -1)
7737 goto failed1;
7738 Py_DECREF(key);
7739 Py_DECREF(value);
7740 }
7741 return result;
7742 failed1:
7743 Py_XDECREF(key);
7744 Py_XDECREF(value);
7745 Py_DECREF(result);
7746 return NULL;
7747 }
7748
7749 /* Create a three-level trie */
7750 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7751 16*count2 + 128*count3 - 1);
7752 if (!result)
7753 return PyErr_NoMemory();
7754 PyObject_Init(result, &EncodingMapType);
7755 mresult = (struct encoding_map*)result;
7756 mresult->count2 = count2;
7757 mresult->count3 = count3;
7758 mlevel1 = mresult->level1;
7759 mlevel2 = mresult->level23;
7760 mlevel3 = mresult->level23 + 16*count2;
7761 memcpy(mlevel1, level1, 32);
7762 memset(mlevel2, 0xFF, 16*count2);
7763 memset(mlevel3, 0, 128*count3);
7764 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007765 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007766 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007767 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7768 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007769 /* unmapped character */
7770 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007771 o1 = ch>>11;
7772 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007773 i2 = 16*mlevel1[o1] + o2;
7774 if (mlevel2[i2] == 0xFF)
7775 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007776 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007777 i3 = 128*mlevel2[i2] + o3;
7778 mlevel3[i3] = i;
7779 }
7780 return result;
7781}
7782
7783static int
Victor Stinner22168992011-11-20 17:09:18 +01007784encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007785{
7786 struct encoding_map *map = (struct encoding_map*)mapping;
7787 int l1 = c>>11;
7788 int l2 = (c>>7) & 0xF;
7789 int l3 = c & 0x7F;
7790 int i;
7791
Victor Stinner22168992011-11-20 17:09:18 +01007792 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007794 if (c == 0)
7795 return 0;
7796 /* level 1*/
7797 i = map->level1[l1];
7798 if (i == 0xFF) {
7799 return -1;
7800 }
7801 /* level 2*/
7802 i = map->level23[16*i+l2];
7803 if (i == 0xFF) {
7804 return -1;
7805 }
7806 /* level 3 */
7807 i = map->level23[16*map->count2 + 128*i + l3];
7808 if (i == 0) {
7809 return -1;
7810 }
7811 return i;
7812}
7813
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814/* Lookup the character ch in the mapping. If the character
7815 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007816 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007817static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007818charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819{
Christian Heimes217cfd12007-12-02 14:31:20 +00007820 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 PyObject *x;
7822
7823 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007825 x = PyObject_GetItem(mapping, w);
7826 Py_DECREF(w);
7827 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7829 /* No mapping found means: mapping is undefined. */
7830 PyErr_Clear();
7831 x = Py_None;
7832 Py_INCREF(x);
7833 return x;
7834 } else
7835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007837 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007839 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 long value = PyLong_AS_LONG(x);
7841 if (value < 0 || value > 255) {
7842 PyErr_SetString(PyExc_TypeError,
7843 "character mapping must be in range(256)");
7844 Py_DECREF(x);
7845 return NULL;
7846 }
7847 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007849 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 /* wrong return value */
7853 PyErr_Format(PyExc_TypeError,
7854 "character mapping must return integer, bytes or None, not %.400s",
7855 x->ob_type->tp_name);
7856 Py_DECREF(x);
7857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 }
7859}
7860
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007862charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007863{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007864 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7865 /* exponentially overallocate to minimize reallocations */
7866 if (requiredsize < 2*outsize)
7867 requiredsize = 2*outsize;
7868 if (_PyBytes_Resize(outobj, requiredsize))
7869 return -1;
7870 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871}
7872
/* Outcome of an attempt to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred during lookup or output resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007876/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007877 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007878 space is available. Return a new reference to the object that
7879 was put in the output buffer, or Py_None, if the mapping was undefined
7880 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007881 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007882static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007883charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007884 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007885{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007886 PyObject *rep;
7887 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007888 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007889
Christian Heimes90aa7642007-12-19 02:45:37 +00007890 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893 if (res == -1)
7894 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 if (outsize<requiredsize)
7896 if (charmapencode_resize(outobj, outpos, requiredsize))
7897 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007898 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 outstart[(*outpos)++] = (char)res;
7900 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901 }
7902
7903 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007904 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 Py_DECREF(rep);
7908 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007909 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 if (PyLong_Check(rep)) {
7911 Py_ssize_t requiredsize = *outpos+1;
7912 if (outsize<requiredsize)
7913 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7914 Py_DECREF(rep);
7915 return enc_EXCEPTION;
7916 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007917 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007919 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 else {
7921 const char *repchars = PyBytes_AS_STRING(rep);
7922 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7923 Py_ssize_t requiredsize = *outpos+repsize;
7924 if (outsize<requiredsize)
7925 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7926 Py_DECREF(rep);
7927 return enc_EXCEPTION;
7928 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007929 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 memcpy(outstart + *outpos, repchars, repsize);
7931 *outpos += repsize;
7932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007933 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007934 Py_DECREF(rep);
7935 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007936}
7937
7938/* handle an error in PyUnicode_EncodeCharmap
7939 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007940static int
7941charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007942 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007944 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007945 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007946{
7947 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007948 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007949 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007950 enum PyUnicode_Kind kind;
7951 void *data;
7952 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007953 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007954 Py_ssize_t collstartpos = *inpos;
7955 Py_ssize_t collendpos = *inpos+1;
7956 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957 char *encoding = "charmap";
7958 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007959 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007960 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007961 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007962
Benjamin Petersonbac79492012-01-14 13:34:47 -05007963 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007964 return -1;
7965 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007966 /* find all unencodable characters */
7967 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007968 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007969 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007970 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007971 val = encoding_map_lookup(ch, mapping);
7972 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 break;
7974 ++collendpos;
7975 continue;
7976 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007977
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007978 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7979 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 if (rep==NULL)
7981 return -1;
7982 else if (rep!=Py_None) {
7983 Py_DECREF(rep);
7984 break;
7985 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007986 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988 }
7989 /* cache callback name lookup
7990 * (if not done yet, i.e. it's the first error) */
7991 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 if ((errors==NULL) || (!strcmp(errors, "strict")))
7993 *known_errorHandler = 1;
7994 else if (!strcmp(errors, "replace"))
7995 *known_errorHandler = 2;
7996 else if (!strcmp(errors, "ignore"))
7997 *known_errorHandler = 3;
7998 else if (!strcmp(errors, "xmlcharrefreplace"))
7999 *known_errorHandler = 4;
8000 else
8001 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008002 }
8003 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008004 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008005 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008006 return -1;
8007 case 2: /* replace */
8008 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 x = charmapencode_output('?', mapping, res, respos);
8010 if (x==enc_EXCEPTION) {
8011 return -1;
8012 }
8013 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008014 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 return -1;
8016 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008017 }
8018 /* fall through */
8019 case 3: /* ignore */
8020 *inpos = collendpos;
8021 break;
8022 case 4: /* xmlcharrefreplace */
8023 /* generate replacement (temporarily (mis)uses p) */
8024 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 char buffer[2+29+1+1];
8026 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008027 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 for (cp = buffer; *cp; ++cp) {
8029 x = charmapencode_output(*cp, mapping, res, respos);
8030 if (x==enc_EXCEPTION)
8031 return -1;
8032 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008033 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 return -1;
8035 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008036 }
8037 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 *inpos = collendpos;
8039 break;
8040 default:
8041 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008042 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008044 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008046 if (PyBytes_Check(repunicode)) {
8047 /* Directly copy bytes result to output. */
8048 Py_ssize_t outsize = PyBytes_Size(*res);
8049 Py_ssize_t requiredsize;
8050 repsize = PyBytes_Size(repunicode);
8051 requiredsize = *respos + repsize;
8052 if (requiredsize > outsize)
8053 /* Make room for all additional bytes. */
8054 if (charmapencode_resize(res, respos, requiredsize)) {
8055 Py_DECREF(repunicode);
8056 return -1;
8057 }
8058 memcpy(PyBytes_AsString(*res) + *respos,
8059 PyBytes_AsString(repunicode), repsize);
8060 *respos += repsize;
8061 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008062 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008063 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008064 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008065 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008066 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008067 Py_DECREF(repunicode);
8068 return -1;
8069 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008070 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008071 data = PyUnicode_DATA(repunicode);
8072 kind = PyUnicode_KIND(repunicode);
8073 for (index = 0; index < repsize; index++) {
8074 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8075 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008077 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 return -1;
8079 }
8080 else if (x==enc_FAILED) {
8081 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008082 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 return -1;
8084 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008085 }
8086 *inpos = newpos;
8087 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 }
8089 return 0;
8090}
8091
Alexander Belopolsky40018472011-02-26 01:02:56 +00008092PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008093_PyUnicode_EncodeCharmap(PyObject *unicode,
8094 PyObject *mapping,
8095 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 /* output object */
8098 PyObject *res = NULL;
8099 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008100 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008101 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008102 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008103 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 PyObject *errorHandler = NULL;
8105 PyObject *exc = NULL;
8106 /* the following variable is used for caching string comparisons
8107 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8108 * 3=ignore, 4=xmlcharrefreplace */
8109 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110
Benjamin Petersonbac79492012-01-14 13:34:47 -05008111 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008112 return NULL;
8113 size = PyUnicode_GET_LENGTH(unicode);
8114
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115 /* Default to Latin-1 */
8116 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008117 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 /* allocate enough for a simple encoding without
8120 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008121 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008122 if (res == NULL)
8123 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008124 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008127 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008128 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008130 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 if (x==enc_EXCEPTION) /* error */
8132 goto onError;
8133 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008134 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 &exc,
8136 &known_errorHandler, &errorHandler, errors,
8137 &res, &respos)) {
8138 goto onError;
8139 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008140 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 else
8142 /* done with this character => adjust input position */
8143 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008147 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008148 if (_PyBytes_Resize(&res, respos) < 0)
8149 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008151 Py_XDECREF(exc);
8152 Py_XDECREF(errorHandler);
8153 return res;
8154
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008156 Py_XDECREF(res);
8157 Py_XDECREF(exc);
8158 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 return NULL;
8160}
8161
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008162/* Deprecated */
8163PyObject *
8164PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8165 Py_ssize_t size,
8166 PyObject *mapping,
8167 const char *errors)
8168{
8169 PyObject *result;
8170 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8171 if (unicode == NULL)
8172 return NULL;
8173 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8174 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008175 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008176}
8177
Alexander Belopolsky40018472011-02-26 01:02:56 +00008178PyObject *
8179PyUnicode_AsCharmapString(PyObject *unicode,
8180 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181{
8182 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 PyErr_BadArgument();
8184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008186 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187}
8188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008189/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008190static void
8191make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008193 Py_ssize_t startpos, Py_ssize_t endpos,
8194 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008196 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008197 *exceptionObject = _PyUnicodeTranslateError_Create(
8198 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199 }
8200 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8202 goto onError;
8203 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8204 goto onError;
8205 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8206 goto onError;
8207 return;
8208 onError:
8209 Py_DECREF(*exceptionObject);
8210 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211 }
8212}
8213
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008214/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008215static void
8216raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008217 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008218 Py_ssize_t startpos, Py_ssize_t endpos,
8219 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008220{
8221 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008222 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008223 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225}
8226
8227/* error handling callback helper:
8228 build arguments, call the callback and check the arguments,
8229 put the result into newpos and return the replacement string, which
8230 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008231static PyObject *
8232unicode_translate_call_errorhandler(const char *errors,
8233 PyObject **errorHandler,
8234 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008236 Py_ssize_t startpos, Py_ssize_t endpos,
8237 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008239 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008241 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242 PyObject *restuple;
8243 PyObject *resunicode;
8244
8245 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 }
8250
8251 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008255
8256 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008261 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 Py_DECREF(restuple);
8263 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 }
8265 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 &resunicode, &i_newpos)) {
8267 Py_DECREF(restuple);
8268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008270 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008272 else
8273 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8276 Py_DECREF(restuple);
8277 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008278 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008279 Py_INCREF(resunicode);
8280 Py_DECREF(restuple);
8281 return resunicode;
8282}
8283
8284/* Lookup the character ch in the mapping and put the result in result,
8285 which must be decrefed by the caller.
8286 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008287static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008288charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289{
Christian Heimes217cfd12007-12-02 14:31:20 +00008290 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008291 PyObject *x;
8292
8293 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008295 x = PyObject_GetItem(mapping, w);
8296 Py_DECREF(w);
8297 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8299 /* No mapping found means: use 1:1 mapping. */
8300 PyErr_Clear();
8301 *result = NULL;
8302 return 0;
8303 } else
8304 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 }
8306 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 *result = x;
8308 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008310 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 long value = PyLong_AS_LONG(x);
8312 long max = PyUnicode_GetMax();
8313 if (value < 0 || value > max) {
8314 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008315 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 Py_DECREF(x);
8317 return -1;
8318 }
8319 *result = x;
8320 return 0;
8321 }
8322 else if (PyUnicode_Check(x)) {
8323 *result = x;
8324 return 0;
8325 }
8326 else {
8327 /* wrong return value */
8328 PyErr_SetString(PyExc_TypeError,
8329 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008330 Py_DECREF(x);
8331 return -1;
8332 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333}
8334/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 if not reallocate and adjust various state variables.
8336 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008337static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008341 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008342 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008343 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 /* exponentially overallocate to minimize reallocations */
8345 if (requiredsize < 2 * oldsize)
8346 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008347 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8348 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008350 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 }
8353 return 0;
8354}
8355/* lookup the character, put the result in the output string and adjust
8356 various state variables. Return a new reference to the object that
8357 was put in the output buffer in *result, or Py_None, if the mapping was
8358 undefined (in which case no character was written).
8359 The called must decref result.
8360 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008361static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8363 PyObject *mapping, Py_UCS4 **output,
8364 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008365 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8368 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373 }
8374 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008376 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 }
8380 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 Py_ssize_t repsize;
8382 if (PyUnicode_READY(*res) == -1)
8383 return -1;
8384 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 if (repsize==1) {
8386 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 }
8389 else if (repsize!=0) {
8390 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 Py_ssize_t requiredsize = *opos +
8392 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 Py_ssize_t i;
8395 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 for(i = 0; i < repsize; i++)
8398 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400 }
8401 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 return 0;
8404}
8405
Alexander Belopolsky40018472011-02-26 01:02:56 +00008406PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008407_PyUnicode_TranslateCharmap(PyObject *input,
8408 PyObject *mapping,
8409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411 /* input object */
8412 char *idata;
8413 Py_ssize_t size, i;
8414 int kind;
8415 /* output buffer */
8416 Py_UCS4 *output = NULL;
8417 Py_ssize_t osize;
8418 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008420 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 char *reason = "character maps to <undefined>";
8422 PyObject *errorHandler = NULL;
8423 PyObject *exc = NULL;
8424 /* the following variable is used for caching string comparisons
8425 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8426 * 3=ignore, 4=xmlcharrefreplace */
8427 int known_errorHandler = -1;
8428
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 PyErr_BadArgument();
8431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 if (PyUnicode_READY(input) == -1)
8435 return NULL;
8436 idata = (char*)PyUnicode_DATA(input);
8437 kind = PyUnicode_KIND(input);
8438 size = PyUnicode_GET_LENGTH(input);
8439 i = 0;
8440
8441 if (size == 0) {
8442 Py_INCREF(input);
8443 return input;
8444 }
8445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008446 /* allocate enough for a simple 1:1 translation without
8447 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 osize = size;
8449 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8450 opos = 0;
8451 if (output == NULL) {
8452 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008456 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 /* try to encode it */
8458 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008459 if (charmaptranslate_output(input, i, mapping,
8460 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 Py_XDECREF(x);
8462 goto onError;
8463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008464 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 else { /* untranslatable character */
8468 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8469 Py_ssize_t repsize;
8470 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 Py_ssize_t collstart = i;
8474 Py_ssize_t collend = i+1;
8475 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 while (collend < size) {
8479 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 goto onError;
8481 Py_XDECREF(x);
8482 if (x!=Py_None)
8483 break;
8484 ++collend;
8485 }
8486 /* cache callback name lookup
8487 * (if not done yet, i.e. it's the first error) */
8488 if (known_errorHandler==-1) {
8489 if ((errors==NULL) || (!strcmp(errors, "strict")))
8490 known_errorHandler = 1;
8491 else if (!strcmp(errors, "replace"))
8492 known_errorHandler = 2;
8493 else if (!strcmp(errors, "ignore"))
8494 known_errorHandler = 3;
8495 else if (!strcmp(errors, "xmlcharrefreplace"))
8496 known_errorHandler = 4;
8497 else
8498 known_errorHandler = 0;
8499 }
8500 switch (known_errorHandler) {
8501 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 raise_translate_exception(&exc, input, collstart,
8503 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008504 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 case 2: /* replace */
8506 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 for (coll = collstart; coll<collend; coll++)
8508 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 /* fall through */
8510 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 break;
8513 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 /* generate replacement (temporarily (mis)uses i) */
8515 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 char buffer[2+29+1+1];
8517 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008518 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8519 if (charmaptranslate_makespace(&output, &osize,
8520 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 goto onError;
8522 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 break;
8527 default:
8528 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 reason, input, &exc,
8530 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008531 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008533 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008534 Py_DECREF(repunicode);
8535 goto onError;
8536 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 repsize = PyUnicode_GET_LENGTH(repunicode);
8539 if (charmaptranslate_makespace(&output, &osize,
8540 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 Py_DECREF(repunicode);
8542 goto onError;
8543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 for (uni2 = 0; repsize-->0; ++uni2)
8545 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8546 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008548 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008549 }
8550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8552 if (!res)
8553 goto onError;
8554 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555 Py_XDECREF(exc);
8556 Py_XDECREF(errorHandler);
8557 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 Py_XDECREF(exc);
8562 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 return NULL;
8564}
8565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566/* Deprecated. Use PyUnicode_Translate instead. */
8567PyObject *
8568PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8569 Py_ssize_t size,
8570 PyObject *mapping,
8571 const char *errors)
8572{
Christian Heimes5f520f42012-09-11 14:03:25 +02008573 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008574 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8575 if (!unicode)
8576 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008577 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8578 Py_DECREF(unicode);
8579 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580}
8581
Alexander Belopolsky40018472011-02-26 01:02:56 +00008582PyObject *
8583PyUnicode_Translate(PyObject *str,
8584 PyObject *mapping,
8585 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586{
8587 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008588
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589 str = PyUnicode_FromObject(str);
8590 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008591 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 Py_DECREF(str);
8594 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595}
Tim Petersced69f82003-09-16 20:30:58 +00008596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008597static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008598fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599{
8600 /* No need to call PyUnicode_READY(self) because this function is only
8601 called as a callback from fixup() which does it already. */
8602 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8603 const int kind = PyUnicode_KIND(self);
8604 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008605 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008606 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 Py_ssize_t i;
8608
8609 for (i = 0; i < len; ++i) {
8610 ch = PyUnicode_READ(kind, data, i);
8611 fixed = 0;
8612 if (ch > 127) {
8613 if (Py_UNICODE_ISSPACE(ch))
8614 fixed = ' ';
8615 else {
8616 const int decimal = Py_UNICODE_TODECIMAL(ch);
8617 if (decimal >= 0)
8618 fixed = '0' + decimal;
8619 }
8620 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008621 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008622 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 PyUnicode_WRITE(kind, data, i, fixed);
8624 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008625 else
8626 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 }
8629
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008630 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631}
8632
8633PyObject *
8634_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8635{
8636 if (!PyUnicode_Check(unicode)) {
8637 PyErr_BadInternalCall();
8638 return NULL;
8639 }
8640 if (PyUnicode_READY(unicode) == -1)
8641 return NULL;
8642 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8643 /* If the string is already ASCII, just return the same string */
8644 Py_INCREF(unicode);
8645 return unicode;
8646 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008647 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648}
8649
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008650PyObject *
8651PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8652 Py_ssize_t length)
8653{
Victor Stinnerf0124502011-11-21 23:12:56 +01008654 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008655 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008656 Py_UCS4 maxchar;
8657 enum PyUnicode_Kind kind;
8658 void *data;
8659
Victor Stinner99d7ad02012-02-22 13:37:39 +01008660 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008661 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008662 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008663 if (ch > 127) {
8664 int decimal = Py_UNICODE_TODECIMAL(ch);
8665 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008666 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008667 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008668 }
8669 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008670
8671 /* Copy to a new string */
8672 decimal = PyUnicode_New(length, maxchar);
8673 if (decimal == NULL)
8674 return decimal;
8675 kind = PyUnicode_KIND(decimal);
8676 data = PyUnicode_DATA(decimal);
8677 /* Iterate over code points */
8678 for (i = 0; i < length; i++) {
8679 Py_UNICODE ch = s[i];
8680 if (ch > 127) {
8681 int decimal = Py_UNICODE_TODECIMAL(ch);
8682 if (decimal >= 0)
8683 ch = '0' + decimal;
8684 }
8685 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008687 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008688}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008689/* --- Decimal Encoder ---------------------------------------------------- */
8690
Alexander Belopolsky40018472011-02-26 01:02:56 +00008691int
8692PyUnicode_EncodeDecimal(Py_UNICODE *s,
8693 Py_ssize_t length,
8694 char *output,
8695 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008696{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008697 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008698 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008699 enum PyUnicode_Kind kind;
8700 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008701
8702 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 PyErr_BadArgument();
8704 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008705 }
8706
Victor Stinner42bf7752011-11-21 22:52:58 +01008707 unicode = PyUnicode_FromUnicode(s, length);
8708 if (unicode == NULL)
8709 return -1;
8710
Benjamin Petersonbac79492012-01-14 13:34:47 -05008711 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008712 Py_DECREF(unicode);
8713 return -1;
8714 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008715 kind = PyUnicode_KIND(unicode);
8716 data = PyUnicode_DATA(unicode);
8717
Victor Stinnerb84d7232011-11-22 01:50:07 +01008718 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008719 PyObject *exc;
8720 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008722 Py_ssize_t startpos;
8723
8724 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008725
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008727 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008728 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008730 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 decimal = Py_UNICODE_TODECIMAL(ch);
8732 if (decimal >= 0) {
8733 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008734 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 continue;
8736 }
8737 if (0 < ch && ch < 256) {
8738 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008739 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 continue;
8741 }
Victor Stinner6345be92011-11-25 20:09:01 +01008742
Victor Stinner42bf7752011-11-21 22:52:58 +01008743 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008744 exc = NULL;
8745 raise_encode_exception(&exc, "decimal", unicode,
8746 startpos, startpos+1,
8747 "invalid decimal Unicode string");
8748 Py_XDECREF(exc);
8749 Py_DECREF(unicode);
8750 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008751 }
8752 /* 0-terminate the output string */
8753 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008754 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008755 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008756}
8757
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758/* --- Helpers ------------------------------------------------------------ */
8759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008760static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008761any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 Py_ssize_t start,
8763 Py_ssize_t end)
8764{
8765 int kind1, kind2, kind;
8766 void *buf1, *buf2;
8767 Py_ssize_t len1, len2, result;
8768
8769 kind1 = PyUnicode_KIND(s1);
8770 kind2 = PyUnicode_KIND(s2);
8771 kind = kind1 > kind2 ? kind1 : kind2;
8772 buf1 = PyUnicode_DATA(s1);
8773 buf2 = PyUnicode_DATA(s2);
8774 if (kind1 != kind)
8775 buf1 = _PyUnicode_AsKind(s1, kind);
8776 if (!buf1)
8777 return -2;
8778 if (kind2 != kind)
8779 buf2 = _PyUnicode_AsKind(s2, kind);
8780 if (!buf2) {
8781 if (kind1 != kind) PyMem_Free(buf1);
8782 return -2;
8783 }
8784 len1 = PyUnicode_GET_LENGTH(s1);
8785 len2 = PyUnicode_GET_LENGTH(s2);
8786
Victor Stinner794d5672011-10-10 03:21:36 +02008787 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008788 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008789 case PyUnicode_1BYTE_KIND:
8790 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8791 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8792 else
8793 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8794 break;
8795 case PyUnicode_2BYTE_KIND:
8796 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8797 break;
8798 case PyUnicode_4BYTE_KIND:
8799 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8800 break;
8801 default:
8802 assert(0); result = -2;
8803 }
8804 }
8805 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008806 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008807 case PyUnicode_1BYTE_KIND:
8808 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8809 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8810 else
8811 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8812 break;
8813 case PyUnicode_2BYTE_KIND:
8814 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8815 break;
8816 case PyUnicode_4BYTE_KIND:
8817 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8818 break;
8819 default:
8820 assert(0); result = -2;
8821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 }
8823
8824 if (kind1 != kind)
8825 PyMem_Free(buf1);
8826 if (kind2 != kind)
8827 PyMem_Free(buf2);
8828
8829 return result;
8830}
8831
8832Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008833_PyUnicode_InsertThousandsGrouping(
8834 PyObject *unicode, Py_ssize_t index,
8835 Py_ssize_t n_buffer,
8836 void *digits, Py_ssize_t n_digits,
8837 Py_ssize_t min_width,
8838 const char *grouping, PyObject *thousands_sep,
8839 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840{
Victor Stinner41a863c2012-02-24 00:37:51 +01008841 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008842 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008843 Py_ssize_t thousands_sep_len;
8844 Py_ssize_t len;
8845
8846 if (unicode != NULL) {
8847 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008848 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008849 }
8850 else {
8851 kind = PyUnicode_1BYTE_KIND;
8852 data = NULL;
8853 }
8854 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8855 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8856 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8857 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008858 if (thousands_sep_kind < kind) {
8859 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8860 if (!thousands_sep_data)
8861 return -1;
8862 }
8863 else {
8864 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8865 if (!data)
8866 return -1;
8867 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008868 }
8869
Benjamin Petersonead6b532011-12-20 17:23:42 -06008870 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008871 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008872 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008873 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008874 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008875 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008876 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008877 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008878 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008879 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008880 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008881 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008882 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008884 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008885 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008886 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008887 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008888 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008890 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008891 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008892 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008893 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008894 break;
8895 default:
8896 assert(0);
8897 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008899 if (unicode != NULL && thousands_sep_kind != kind) {
8900 if (thousands_sep_kind < kind)
8901 PyMem_Free(thousands_sep_data);
8902 else
8903 PyMem_Free(data);
8904 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008905 if (unicode == NULL) {
8906 *maxchar = 127;
8907 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008908 *maxchar = MAX_MAXCHAR(*maxchar,
8909 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008910 }
8911 }
8912 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913}
8914
8915
/* Helper macro to fix up start/end slice values in place.
   `end` is capped at `len`; a negative `end` or `start` counts from the
   end of the sequence and is floored at 0.  `start` is not capped at
   `len`.  `start` and `end` must be lvalues; `len` is evaluated more than
   once.

   The body is wrapped in do { } while (0) and the arguments are
   parenthesized so that the macro expands to exactly one statement and is
   safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008930
Alexander Belopolsky40018472011-02-26 01:02:56 +00008931Py_ssize_t
8932PyUnicode_Count(PyObject *str,
8933 PyObject *substr,
8934 Py_ssize_t start,
8935 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008937 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008938 PyObject* str_obj;
8939 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 int kind1, kind2, kind;
8941 void *buf1 = NULL, *buf2 = NULL;
8942 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008943
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008944 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008945 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008947 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008948 if (!sub_obj) {
8949 Py_DECREF(str_obj);
8950 return -1;
8951 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008952 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008953 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 Py_DECREF(str_obj);
8955 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956 }
Tim Petersced69f82003-09-16 20:30:58 +00008957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 kind1 = PyUnicode_KIND(str_obj);
8959 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008960 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008963 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008964 if (kind2 > kind) {
8965 Py_DECREF(sub_obj);
8966 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008967 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008968 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008969 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008970 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 if (!buf2)
8972 goto onError;
8973 len1 = PyUnicode_GET_LENGTH(str_obj);
8974 len2 = PyUnicode_GET_LENGTH(sub_obj);
8975
8976 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008977 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008979 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8980 result = asciilib_count(
8981 ((Py_UCS1*)buf1) + start, end - start,
8982 buf2, len2, PY_SSIZE_T_MAX
8983 );
8984 else
8985 result = ucs1lib_count(
8986 ((Py_UCS1*)buf1) + start, end - start,
8987 buf2, len2, PY_SSIZE_T_MAX
8988 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 break;
8990 case PyUnicode_2BYTE_KIND:
8991 result = ucs2lib_count(
8992 ((Py_UCS2*)buf1) + start, end - start,
8993 buf2, len2, PY_SSIZE_T_MAX
8994 );
8995 break;
8996 case PyUnicode_4BYTE_KIND:
8997 result = ucs4lib_count(
8998 ((Py_UCS4*)buf1) + start, end - start,
8999 buf2, len2, PY_SSIZE_T_MAX
9000 );
9001 break;
9002 default:
9003 assert(0); result = 0;
9004 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009005
9006 Py_DECREF(sub_obj);
9007 Py_DECREF(str_obj);
9008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 if (kind2 != kind)
9010 PyMem_Free(buf2);
9011
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 onError:
9014 Py_DECREF(sub_obj);
9015 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 if (kind2 != kind && buf2)
9017 PyMem_Free(buf2);
9018 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019}
9020
Alexander Belopolsky40018472011-02-26 01:02:56 +00009021Py_ssize_t
9022PyUnicode_Find(PyObject *str,
9023 PyObject *sub,
9024 Py_ssize_t start,
9025 Py_ssize_t end,
9026 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009028 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009029
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009031 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009033 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009034 if (!sub) {
9035 Py_DECREF(str);
9036 return -2;
9037 }
9038 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9039 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 Py_DECREF(str);
9041 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 }
Tim Petersced69f82003-09-16 20:30:58 +00009043
Victor Stinner794d5672011-10-10 03:21:36 +02009044 result = any_find_slice(direction,
9045 str, sub, start, end
9046 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009047
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009049 Py_DECREF(sub);
9050
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051 return result;
9052}
9053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054Py_ssize_t
9055PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9056 Py_ssize_t start, Py_ssize_t end,
9057 int direction)
9058{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009060 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 if (PyUnicode_READY(str) == -1)
9062 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009063 if (start < 0 || end < 0) {
9064 PyErr_SetString(PyExc_IndexError, "string index out of range");
9065 return -2;
9066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067 if (end > PyUnicode_GET_LENGTH(str))
9068 end = PyUnicode_GET_LENGTH(str);
9069 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009070 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9071 kind, end-start, ch, direction);
9072 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009074 else
9075 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076}
9077
Alexander Belopolsky40018472011-02-26 01:02:56 +00009078static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009079tailmatch(PyObject *self,
9080 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009081 Py_ssize_t start,
9082 Py_ssize_t end,
9083 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 int kind_self;
9086 int kind_sub;
9087 void *data_self;
9088 void *data_sub;
9089 Py_ssize_t offset;
9090 Py_ssize_t i;
9091 Py_ssize_t end_sub;
9092
9093 if (PyUnicode_READY(self) == -1 ||
9094 PyUnicode_READY(substring) == -1)
9095 return 0;
9096
9097 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 return 1;
9099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9101 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009103 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105 kind_self = PyUnicode_KIND(self);
9106 data_self = PyUnicode_DATA(self);
9107 kind_sub = PyUnicode_KIND(substring);
9108 data_sub = PyUnicode_DATA(substring);
9109 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9110
9111 if (direction > 0)
9112 offset = end;
9113 else
9114 offset = start;
9115
9116 if (PyUnicode_READ(kind_self, data_self, offset) ==
9117 PyUnicode_READ(kind_sub, data_sub, 0) &&
9118 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9119 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9120 /* If both are of the same kind, memcmp is sufficient */
9121 if (kind_self == kind_sub) {
9122 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009123 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 data_sub,
9125 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009126 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 }
9128 /* otherwise we have to compare each character by first accesing it */
9129 else {
9130 /* We do not need to compare 0 and len(substring)-1 because
9131 the if statement above ensured already that they are equal
9132 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02009133 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009134 for (i = 1; i < end_sub; ++i) {
9135 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9136 PyUnicode_READ(kind_sub, data_sub, i))
9137 return 0;
9138 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141 }
9142
9143 return 0;
9144}
9145
Alexander Belopolsky40018472011-02-26 01:02:56 +00009146Py_ssize_t
9147PyUnicode_Tailmatch(PyObject *str,
9148 PyObject *substr,
9149 Py_ssize_t start,
9150 Py_ssize_t end,
9151 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009153 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009154
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155 str = PyUnicode_FromObject(str);
9156 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009157 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158 substr = PyUnicode_FromObject(substr);
9159 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009160 Py_DECREF(str);
9161 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162 }
Tim Petersced69f82003-09-16 20:30:58 +00009163
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009164 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009165 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166 Py_DECREF(str);
9167 Py_DECREF(substr);
9168 return result;
9169}
9170
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171/* Apply fixfct filter to the Unicode object self and return a
9172 reference to the modified object */
9173
Alexander Belopolsky40018472011-02-26 01:02:56 +00009174static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009175fixup(PyObject *self,
9176 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 PyObject *u;
9179 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009180 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009182 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009184 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009185 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 /* fix functions return the new maximum character in a string,
9188 if the kind of the resulting unicode object does not change,
9189 everything is fine. Otherwise we need to change the string kind
9190 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009191 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009192
9193 if (maxchar_new == 0) {
9194 /* no changes */;
9195 if (PyUnicode_CheckExact(self)) {
9196 Py_DECREF(u);
9197 Py_INCREF(self);
9198 return self;
9199 }
9200 else
9201 return u;
9202 }
9203
Victor Stinnere6abb482012-05-02 01:15:40 +02009204 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205
Victor Stinnereaab6042011-12-11 22:22:39 +01009206 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009208
9209 /* In case the maximum character changed, we need to
9210 convert the string to the new category. */
9211 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9212 if (v == NULL) {
9213 Py_DECREF(u);
9214 return NULL;
9215 }
9216 if (maxchar_new > maxchar_old) {
9217 /* If the maxchar increased so that the kind changed, not all
9218 characters are representable anymore and we need to fix the
9219 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009220 _PyUnicode_FastCopyCharacters(v, 0,
9221 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009222 maxchar_old = fixfct(v);
9223 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 }
9225 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009226 _PyUnicode_FastCopyCharacters(v, 0,
9227 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009229 Py_DECREF(u);
9230 assert(_PyUnicode_CheckConsistency(v, 1));
9231 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232}
9233
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009234static PyObject *
9235ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009237 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9238 char *resdata, *data = PyUnicode_DATA(self);
9239 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009240
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009241 res = PyUnicode_New(len, 127);
9242 if (res == NULL)
9243 return NULL;
9244 resdata = PyUnicode_DATA(res);
9245 if (lower)
9246 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009248 _Py_bytes_upper(resdata, data, len);
9249 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250}
9251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009253handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009255 Py_ssize_t j;
9256 int final_sigma;
9257 Py_UCS4 c;
9258 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009259
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009260 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9261
9262 where ! is a negation and \p{xxx} is a character with property xxx.
9263 */
9264 for (j = i - 1; j >= 0; j--) {
9265 c = PyUnicode_READ(kind, data, j);
9266 if (!_PyUnicode_IsCaseIgnorable(c))
9267 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009269 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9270 if (final_sigma) {
9271 for (j = i + 1; j < length; j++) {
9272 c = PyUnicode_READ(kind, data, j);
9273 if (!_PyUnicode_IsCaseIgnorable(c))
9274 break;
9275 }
9276 final_sigma = j == length || !_PyUnicode_IsCased(c);
9277 }
9278 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279}
9280
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009281static int
9282lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9283 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009285 /* Obscure special case. */
9286 if (c == 0x3A3) {
9287 mapped[0] = handle_capital_sigma(kind, data, length, i);
9288 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009290 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291}
9292
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009293static Py_ssize_t
9294do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009295{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009296 Py_ssize_t i, k = 0;
9297 int n_res, j;
9298 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009299
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009300 c = PyUnicode_READ(kind, data, 0);
9301 n_res = _PyUnicode_ToUpperFull(c, mapped);
9302 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009303 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009304 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009306 for (i = 1; i < length; i++) {
9307 c = PyUnicode_READ(kind, data, i);
9308 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9309 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009310 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009311 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009312 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009313 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009314 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009315}
9316
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009317static Py_ssize_t
9318do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9319 Py_ssize_t i, k = 0;
9320
9321 for (i = 0; i < length; i++) {
9322 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9323 int n_res, j;
9324 if (Py_UNICODE_ISUPPER(c)) {
9325 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9326 }
9327 else if (Py_UNICODE_ISLOWER(c)) {
9328 n_res = _PyUnicode_ToUpperFull(c, mapped);
9329 }
9330 else {
9331 n_res = 1;
9332 mapped[0] = c;
9333 }
9334 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009335 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009336 res[k++] = mapped[j];
9337 }
9338 }
9339 return k;
9340}
9341
9342static Py_ssize_t
9343do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9344 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009346 Py_ssize_t i, k = 0;
9347
9348 for (i = 0; i < length; i++) {
9349 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9350 int n_res, j;
9351 if (lower)
9352 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9353 else
9354 n_res = _PyUnicode_ToUpperFull(c, mapped);
9355 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009356 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009357 res[k++] = mapped[j];
9358 }
9359 }
9360 return k;
9361}
9362
9363static Py_ssize_t
9364do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9365{
9366 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9367}
9368
9369static Py_ssize_t
9370do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9371{
9372 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9373}
9374
Benjamin Petersone51757f2012-01-12 21:10:29 -05009375static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009376do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9377{
9378 Py_ssize_t i, k = 0;
9379
9380 for (i = 0; i < length; i++) {
9381 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9382 Py_UCS4 mapped[3];
9383 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9384 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009385 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009386 res[k++] = mapped[j];
9387 }
9388 }
9389 return k;
9390}
9391
9392static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009393do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9394{
9395 Py_ssize_t i, k = 0;
9396 int previous_is_cased;
9397
9398 previous_is_cased = 0;
9399 for (i = 0; i < length; i++) {
9400 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9401 Py_UCS4 mapped[3];
9402 int n_res, j;
9403
9404 if (previous_is_cased)
9405 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9406 else
9407 n_res = _PyUnicode_ToTitleFull(c, mapped);
9408
9409 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009410 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009411 res[k++] = mapped[j];
9412 }
9413
9414 previous_is_cased = _PyUnicode_IsCased(c);
9415 }
9416 return k;
9417}
9418
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009419static PyObject *
9420case_operation(PyObject *self,
9421 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9422{
9423 PyObject *res = NULL;
9424 Py_ssize_t length, newlength = 0;
9425 int kind, outkind;
9426 void *data, *outdata;
9427 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9428
Benjamin Petersoneea48462012-01-16 14:28:50 -05009429 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009430
9431 kind = PyUnicode_KIND(self);
9432 data = PyUnicode_DATA(self);
9433 length = PyUnicode_GET_LENGTH(self);
9434 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9435 if (tmp == NULL)
9436 return PyErr_NoMemory();
9437 newlength = perform(kind, data, length, tmp, &maxchar);
9438 res = PyUnicode_New(newlength, maxchar);
9439 if (res == NULL)
9440 goto leave;
9441 tmpend = tmp + newlength;
9442 outdata = PyUnicode_DATA(res);
9443 outkind = PyUnicode_KIND(res);
9444 switch (outkind) {
9445 case PyUnicode_1BYTE_KIND:
9446 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9447 break;
9448 case PyUnicode_2BYTE_KIND:
9449 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9450 break;
9451 case PyUnicode_4BYTE_KIND:
9452 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9453 break;
9454 default:
9455 assert(0);
9456 break;
9457 }
9458 leave:
9459 PyMem_FREE(tmp);
9460 return res;
9461}
9462
Tim Peters8ce9f162004-08-27 01:49:32 +00009463PyObject *
9464PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009467 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009469 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009470 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9471 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009472 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009474 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009476 int use_memcpy;
9477 unsigned char *res_data = NULL, *sep_data = NULL;
9478 PyObject *last_obj;
9479 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480
Tim Peters05eba1f2004-08-27 21:32:02 +00009481 fseq = PySequence_Fast(seq, "");
9482 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009483 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009484 }
9485
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009486 /* NOTE: the following code can't call back into Python code,
9487 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009488 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009489
Tim Peters05eba1f2004-08-27 21:32:02 +00009490 seqlen = PySequence_Fast_GET_SIZE(fseq);
9491 /* If empty sequence, return u"". */
9492 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009493 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009494 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009495 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009496
Tim Peters05eba1f2004-08-27 21:32:02 +00009497 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009498 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009499 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009500 if (seqlen == 1) {
9501 if (PyUnicode_CheckExact(items[0])) {
9502 res = items[0];
9503 Py_INCREF(res);
9504 Py_DECREF(fseq);
9505 return res;
9506 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009507 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009508 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009509 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009510 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009511 /* Set up sep and seplen */
9512 if (separator == NULL) {
9513 /* fall back to a blank space separator */
9514 sep = PyUnicode_FromOrdinal(' ');
9515 if (!sep)
9516 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009517 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009518 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009519 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009520 else {
9521 if (!PyUnicode_Check(separator)) {
9522 PyErr_Format(PyExc_TypeError,
9523 "separator: expected str instance,"
9524 " %.80s found",
9525 Py_TYPE(separator)->tp_name);
9526 goto onError;
9527 }
9528 if (PyUnicode_READY(separator))
9529 goto onError;
9530 sep = separator;
9531 seplen = PyUnicode_GET_LENGTH(separator);
9532 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9533 /* inc refcount to keep this code path symmetric with the
9534 above case of a blank separator */
9535 Py_INCREF(sep);
9536 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009537 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009538 }
9539
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009540 /* There are at least two things to join, or else we have a subclass
9541 * of str in the sequence.
9542 * Do a pre-pass to figure out the total amount of space we'll
9543 * need (sz), and see whether all argument are strings.
9544 */
9545 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009546#ifdef Py_DEBUG
9547 use_memcpy = 0;
9548#else
9549 use_memcpy = 1;
9550#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009551 for (i = 0; i < seqlen; i++) {
9552 const Py_ssize_t old_sz = sz;
9553 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009554 if (!PyUnicode_Check(item)) {
9555 PyErr_Format(PyExc_TypeError,
9556 "sequence item %zd: expected str instance,"
9557 " %.80s found",
9558 i, Py_TYPE(item)->tp_name);
9559 goto onError;
9560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 if (PyUnicode_READY(item) == -1)
9562 goto onError;
9563 sz += PyUnicode_GET_LENGTH(item);
9564 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009565 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009566 if (i != 0)
9567 sz += seplen;
9568 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9569 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009571 goto onError;
9572 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009573 if (use_memcpy && last_obj != NULL) {
9574 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9575 use_memcpy = 0;
9576 }
9577 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009578 }
Tim Petersced69f82003-09-16 20:30:58 +00009579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009581 if (res == NULL)
9582 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009583
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009584 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009585#ifdef Py_DEBUG
9586 use_memcpy = 0;
9587#else
9588 if (use_memcpy) {
9589 res_data = PyUnicode_1BYTE_DATA(res);
9590 kind = PyUnicode_KIND(res);
9591 if (seplen != 0)
9592 sep_data = PyUnicode_1BYTE_DATA(sep);
9593 }
9594#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009596 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009597 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009598 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009599 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009600 if (use_memcpy) {
9601 Py_MEMCPY(res_data,
9602 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009603 kind * seplen);
9604 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009605 }
9606 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009607 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009608 res_offset += seplen;
9609 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009610 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009611 itemlen = PyUnicode_GET_LENGTH(item);
9612 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009613 if (use_memcpy) {
9614 Py_MEMCPY(res_data,
9615 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009616 kind * itemlen);
9617 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009618 }
9619 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009620 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009621 res_offset += itemlen;
9622 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009623 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009624 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009625 if (use_memcpy)
9626 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009627 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009628 else
9629 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009630
Tim Peters05eba1f2004-08-27 21:32:02 +00009631 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009633 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635
Benjamin Peterson29060642009-01-31 22:14:21 +00009636 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009637 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009639 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640 return NULL;
9641}
9642
/* Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   element width (1, 2 or 4 bytes); PyUnicode_WCHAR_KIND is not
   accepted.  No bounds checking is performed here.

   Every macro argument is parenthesized on use; `length` and `value`
   may be evaluated more than once, so pass expressions without side
   effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9666
Victor Stinnerd3f08822012-05-29 12:57:52 +02009667void
9668_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9669 Py_UCS4 fill_char)
9670{
9671 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9672 const void *data = PyUnicode_DATA(unicode);
9673 assert(PyUnicode_IS_READY(unicode));
9674 assert(unicode_modifiable(unicode));
9675 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9676 assert(start >= 0);
9677 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9678 FILL(kind, data, fill_char, start, length);
9679}
9680
Victor Stinner3fe55312012-01-04 00:33:50 +01009681Py_ssize_t
9682PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9683 Py_UCS4 fill_char)
9684{
9685 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009686
9687 if (!PyUnicode_Check(unicode)) {
9688 PyErr_BadInternalCall();
9689 return -1;
9690 }
9691 if (PyUnicode_READY(unicode) == -1)
9692 return -1;
9693 if (unicode_check_modifiable(unicode))
9694 return -1;
9695
Victor Stinnerd3f08822012-05-29 12:57:52 +02009696 if (start < 0) {
9697 PyErr_SetString(PyExc_IndexError, "string index out of range");
9698 return -1;
9699 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009700 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9701 PyErr_SetString(PyExc_ValueError,
9702 "fill character is bigger than "
9703 "the string maximum character");
9704 return -1;
9705 }
9706
9707 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9708 length = Py_MIN(maxlen, length);
9709 if (length <= 0)
9710 return 0;
9711
Victor Stinnerd3f08822012-05-29 12:57:52 +02009712 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009713 return length;
9714}
9715
Victor Stinner9310abb2011-10-05 00:59:23 +02009716static PyObject *
9717pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009718 Py_ssize_t left,
9719 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 PyObject *u;
9723 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009724 int kind;
9725 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726
9727 if (left < 0)
9728 left = 0;
9729 if (right < 0)
9730 right = 0;
9731
Victor Stinnerc4b49542011-12-11 22:44:26 +01009732 if (left == 0 && right == 0)
9733 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9736 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009737 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9738 return NULL;
9739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009741 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009743 if (!u)
9744 return NULL;
9745
9746 kind = PyUnicode_KIND(u);
9747 data = PyUnicode_DATA(u);
9748 if (left)
9749 FILL(kind, data, fill, 0, left);
9750 if (right)
9751 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009752 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009753 assert(_PyUnicode_CheckConsistency(u, 1));
9754 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755}
9756
Alexander Belopolsky40018472011-02-26 01:02:56 +00009757PyObject *
9758PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761
9762 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009763 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009764 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009765 if (PyUnicode_READY(string) == -1) {
9766 Py_DECREF(string);
9767 return NULL;
9768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769
Benjamin Petersonead6b532011-12-20 17:23:42 -06009770 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009771 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009772 if (PyUnicode_IS_ASCII(string))
9773 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009774 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009775 PyUnicode_GET_LENGTH(string), keepends);
9776 else
9777 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009778 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009779 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 break;
9781 case PyUnicode_2BYTE_KIND:
9782 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009783 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 PyUnicode_GET_LENGTH(string), keepends);
9785 break;
9786 case PyUnicode_4BYTE_KIND:
9787 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009788 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 PyUnicode_GET_LENGTH(string), keepends);
9790 break;
9791 default:
9792 assert(0);
9793 list = 0;
9794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795 Py_DECREF(string);
9796 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797}
9798
Alexander Belopolsky40018472011-02-26 01:02:56 +00009799static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009800split(PyObject *self,
9801 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009802 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 int kind1, kind2, kind;
9805 void *buf1, *buf2;
9806 Py_ssize_t len1, len2;
9807 PyObject* out;
9808
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009810 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 if (PyUnicode_READY(self) == -1)
9813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009816 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009818 if (PyUnicode_IS_ASCII(self))
9819 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009820 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009821 PyUnicode_GET_LENGTH(self), maxcount
9822 );
9823 else
9824 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009825 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009826 PyUnicode_GET_LENGTH(self), maxcount
9827 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 case PyUnicode_2BYTE_KIND:
9829 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009830 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 PyUnicode_GET_LENGTH(self), maxcount
9832 );
9833 case PyUnicode_4BYTE_KIND:
9834 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009835 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836 PyUnicode_GET_LENGTH(self), maxcount
9837 );
9838 default:
9839 assert(0);
9840 return NULL;
9841 }
9842
9843 if (PyUnicode_READY(substring) == -1)
9844 return NULL;
9845
9846 kind1 = PyUnicode_KIND(self);
9847 kind2 = PyUnicode_KIND(substring);
9848 kind = kind1 > kind2 ? kind1 : kind2;
9849 buf1 = PyUnicode_DATA(self);
9850 buf2 = PyUnicode_DATA(substring);
9851 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009852 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 if (!buf1)
9854 return NULL;
9855 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009856 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 if (!buf2) {
9858 if (kind1 != kind) PyMem_Free(buf1);
9859 return NULL;
9860 }
9861 len1 = PyUnicode_GET_LENGTH(self);
9862 len2 = PyUnicode_GET_LENGTH(substring);
9863
Benjamin Petersonead6b532011-12-20 17:23:42 -06009864 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009866 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9867 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009868 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009869 else
9870 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009871 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 break;
9873 case PyUnicode_2BYTE_KIND:
9874 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009875 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 break;
9877 case PyUnicode_4BYTE_KIND:
9878 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009879 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 break;
9881 default:
9882 out = NULL;
9883 }
9884 if (kind1 != kind)
9885 PyMem_Free(buf1);
9886 if (kind2 != kind)
9887 PyMem_Free(buf2);
9888 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889}
9890
Alexander Belopolsky40018472011-02-26 01:02:56 +00009891static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009892rsplit(PyObject *self,
9893 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009894 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 int kind1, kind2, kind;
9897 void *buf1, *buf2;
9898 Py_ssize_t len1, len2;
9899 PyObject* out;
9900
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009901 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009902 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 if (PyUnicode_READY(self) == -1)
9905 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009908 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009910 if (PyUnicode_IS_ASCII(self))
9911 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009912 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009913 PyUnicode_GET_LENGTH(self), maxcount
9914 );
9915 else
9916 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009917 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009918 PyUnicode_GET_LENGTH(self), maxcount
9919 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 case PyUnicode_2BYTE_KIND:
9921 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009922 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 PyUnicode_GET_LENGTH(self), maxcount
9924 );
9925 case PyUnicode_4BYTE_KIND:
9926 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009927 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 PyUnicode_GET_LENGTH(self), maxcount
9929 );
9930 default:
9931 assert(0);
9932 return NULL;
9933 }
9934
9935 if (PyUnicode_READY(substring) == -1)
9936 return NULL;
9937
9938 kind1 = PyUnicode_KIND(self);
9939 kind2 = PyUnicode_KIND(substring);
9940 kind = kind1 > kind2 ? kind1 : kind2;
9941 buf1 = PyUnicode_DATA(self);
9942 buf2 = PyUnicode_DATA(substring);
9943 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009944 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 if (!buf1)
9946 return NULL;
9947 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009948 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 if (!buf2) {
9950 if (kind1 != kind) PyMem_Free(buf1);
9951 return NULL;
9952 }
9953 len1 = PyUnicode_GET_LENGTH(self);
9954 len2 = PyUnicode_GET_LENGTH(substring);
9955
Benjamin Petersonead6b532011-12-20 17:23:42 -06009956 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009958 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9959 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009960 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009961 else
9962 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009963 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 break;
9965 case PyUnicode_2BYTE_KIND:
9966 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009967 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 break;
9969 case PyUnicode_4BYTE_KIND:
9970 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009971 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 break;
9973 default:
9974 out = NULL;
9975 }
9976 if (kind1 != kind)
9977 PyMem_Free(buf1);
9978 if (kind2 != kind)
9979 PyMem_Free(buf2);
9980 return out;
9981}
9982
9983static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009984anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9985 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009987 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009989 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9990 return asciilib_find(buf1, len1, buf2, len2, offset);
9991 else
9992 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 case PyUnicode_2BYTE_KIND:
9994 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9995 case PyUnicode_4BYTE_KIND:
9996 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9997 }
9998 assert(0);
9999 return -1;
10000}
10001
10002static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010003anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10004 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010006 switch (kind) {
10007 case PyUnicode_1BYTE_KIND:
10008 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10009 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10010 else
10011 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10012 case PyUnicode_2BYTE_KIND:
10013 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10014 case PyUnicode_4BYTE_KIND:
10015 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10016 }
10017 assert(0);
10018 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010019}
10020
Alexander Belopolsky40018472011-02-26 01:02:56 +000010021static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022replace(PyObject *self, PyObject *str1,
10023 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 PyObject *u;
10026 char *sbuf = PyUnicode_DATA(self);
10027 char *buf1 = PyUnicode_DATA(str1);
10028 char *buf2 = PyUnicode_DATA(str2);
10029 int srelease = 0, release1 = 0, release2 = 0;
10030 int skind = PyUnicode_KIND(self);
10031 int kind1 = PyUnicode_KIND(str1);
10032 int kind2 = PyUnicode_KIND(str2);
10033 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10034 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10035 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010036 int mayshrink;
10037 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038
10039 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010040 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010042 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043
Victor Stinner59de0ee2011-10-07 10:01:28 +020010044 if (str1 == str2)
10045 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 if (skind < kind1)
10047 /* substring too wide to be present */
10048 goto nothing;
10049
Victor Stinner49a0a212011-10-12 23:46:10 +020010050 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10051 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10052 /* Replacing str1 with str2 may cause a maxchar reduction in the
10053 result string. */
10054 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010055 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010058 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010060 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010062 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010063 Py_UCS4 u1, u2;
10064 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010065 Py_ssize_t index, pos;
10066 char *src;
10067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010069 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10070 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010071 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010074 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010076 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010078
10079 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10080 index = 0;
10081 src = sbuf;
10082 while (--maxcount)
10083 {
10084 pos++;
10085 src += pos * PyUnicode_KIND(self);
10086 slen -= pos;
10087 index += pos;
10088 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10089 if (pos < 0)
10090 break;
10091 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10092 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010093 }
10094 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 int rkind = skind;
10096 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010097 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 if (kind1 < rkind) {
10100 /* widen substring */
10101 buf1 = _PyUnicode_AsKind(str1, rkind);
10102 if (!buf1) goto error;
10103 release1 = 1;
10104 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010105 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010106 if (i < 0)
10107 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 if (rkind > kind2) {
10109 /* widen replacement */
10110 buf2 = _PyUnicode_AsKind(str2, rkind);
10111 if (!buf2) goto error;
10112 release2 = 1;
10113 }
10114 else if (rkind < kind2) {
10115 /* widen self and buf1 */
10116 rkind = kind2;
10117 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010118 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 sbuf = _PyUnicode_AsKind(self, rkind);
10120 if (!sbuf) goto error;
10121 srelease = 1;
10122 buf1 = _PyUnicode_AsKind(str1, rkind);
10123 if (!buf1) goto error;
10124 release1 = 1;
10125 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010126 u = PyUnicode_New(slen, maxchar);
10127 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010129 assert(PyUnicode_KIND(u) == rkind);
10130 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010131
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010132 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010133 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010134 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010136 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010138
10139 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010140 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010141 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010142 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010143 if (i == -1)
10144 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010145 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010147 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010151 }
10152 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010154 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 int rkind = skind;
10156 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010159 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 buf1 = _PyUnicode_AsKind(str1, rkind);
10161 if (!buf1) goto error;
10162 release1 = 1;
10163 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010164 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165 if (n == 0)
10166 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010168 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 buf2 = _PyUnicode_AsKind(str2, rkind);
10170 if (!buf2) goto error;
10171 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010174 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 rkind = kind2;
10176 sbuf = _PyUnicode_AsKind(self, rkind);
10177 if (!sbuf) goto error;
10178 srelease = 1;
10179 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010180 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 buf1 = _PyUnicode_AsKind(str1, rkind);
10182 if (!buf1) goto error;
10183 release1 = 1;
10184 }
10185 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10186 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010187 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 PyErr_SetString(PyExc_OverflowError,
10189 "replace string is too long");
10190 goto error;
10191 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010192 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010193 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010194 _Py_INCREF_UNICODE_EMPTY();
10195 if (!unicode_empty)
10196 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010197 u = unicode_empty;
10198 goto done;
10199 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010200 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 PyErr_SetString(PyExc_OverflowError,
10202 "replace string is too long");
10203 goto error;
10204 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010205 u = PyUnicode_New(new_size, maxchar);
10206 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010208 assert(PyUnicode_KIND(u) == rkind);
10209 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 ires = i = 0;
10211 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010212 while (n-- > 0) {
10213 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010215 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010216 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010217 if (j == -1)
10218 break;
10219 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010220 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010221 memcpy(res + rkind * ires,
10222 sbuf + rkind * i,
10223 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010225 }
10226 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010228 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010230 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010236 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010237 memcpy(res + rkind * ires,
10238 sbuf + rkind * i,
10239 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010240 }
10241 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010242 /* interleave */
10243 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010244 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010246 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010248 if (--n <= 0)
10249 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010250 memcpy(res + rkind * ires,
10251 sbuf + rkind * i,
10252 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 ires++;
10254 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010256 memcpy(res + rkind * ires,
10257 sbuf + rkind * i,
10258 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010259 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010260 }
10261
10262 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010263 unicode_adjust_maxchar(&u);
10264 if (u == NULL)
10265 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010267
10268 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 if (srelease)
10270 PyMem_FREE(sbuf);
10271 if (release1)
10272 PyMem_FREE(buf1);
10273 if (release2)
10274 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010275 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010277
Benjamin Peterson29060642009-01-31 22:14:21 +000010278 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010279 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 if (srelease)
10281 PyMem_FREE(sbuf);
10282 if (release1)
10283 PyMem_FREE(buf1);
10284 if (release2)
10285 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010286 return unicode_result_unchanged(self);
10287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 error:
10289 if (srelease && sbuf)
10290 PyMem_FREE(sbuf);
10291 if (release1 && buf1)
10292 PyMem_FREE(buf1);
10293 if (release2 && buf2)
10294 PyMem_FREE(buf2);
10295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010296}
10297
10298/* --- Unicode Object Methods --------------------------------------------- */
10299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010300PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010301 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302\n\
10303Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010304characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010305
10306static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010307unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010309 if (PyUnicode_READY(self) == -1)
10310 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010311 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312}
10313
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010314PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010315 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316\n\
10317Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010318have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319
10320static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010321unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010323 if (PyUnicode_READY(self) == -1)
10324 return NULL;
10325 if (PyUnicode_GET_LENGTH(self) == 0)
10326 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010327 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328}
10329
Benjamin Petersond5890c82012-01-14 13:23:30 -050010330PyDoc_STRVAR(casefold__doc__,
10331 "S.casefold() -> str\n\
10332\n\
10333Return a version of S suitable for caseless comparisons.");
10334
10335static PyObject *
10336unicode_casefold(PyObject *self)
10337{
10338 if (PyUnicode_READY(self) == -1)
10339 return NULL;
10340 if (PyUnicode_IS_ASCII(self))
10341 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010342 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010343}
10344
10345
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010346/* Argument converter. Coerces to a single unicode character */
10347
10348static int
10349convert_uc(PyObject *obj, void *addr)
10350{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010352 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010353
Benjamin Peterson14339b62009-01-31 16:36:08 +000010354 uniobj = PyUnicode_FromObject(obj);
10355 if (uniobj == NULL) {
10356 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010357 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010358 return 0;
10359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010361 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010362 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010363 Py_DECREF(uniobj);
10364 return 0;
10365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010367 Py_DECREF(uniobj);
10368 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010369}
10370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010371PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010372 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010374Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010375done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376
10377static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010378unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010380 Py_ssize_t marg, left;
10381 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 Py_UCS4 fillchar = ' ';
10383
Victor Stinnere9a29352011-10-01 02:14:59 +020010384 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386
Benjamin Petersonbac79492012-01-14 13:34:47 -050010387 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388 return NULL;
10389
Victor Stinnerc4b49542011-12-11 22:44:26 +010010390 if (PyUnicode_GET_LENGTH(self) >= width)
10391 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392
Victor Stinnerc4b49542011-12-11 22:44:26 +010010393 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394 left = marg / 2 + (marg & width & 1);
10395
Victor Stinner9310abb2011-10-05 00:59:23 +020010396 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397}
10398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399/* This function assumes that str1 and str2 are readied by the caller. */
10400
Marc-André Lemburge5034372000-08-08 08:04:29 +000010401static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010402unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010403{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 int kind1, kind2;
10405 void *data1, *data2;
10406 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 kind1 = PyUnicode_KIND(str1);
10409 kind2 = PyUnicode_KIND(str2);
10410 data1 = PyUnicode_DATA(str1);
10411 data2 = PyUnicode_DATA(str2);
10412 len1 = PyUnicode_GET_LENGTH(str1);
10413 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 for (i = 0; i < len1 && i < len2; ++i) {
10416 Py_UCS4 c1, c2;
10417 c1 = PyUnicode_READ(kind1, data1, i);
10418 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010419
10420 if (c1 != c2)
10421 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010422 }
10423
10424 return (len1 < len2) ? -1 : (len1 != len2);
10425}
10426
Alexander Belopolsky40018472011-02-26 01:02:56 +000010427int
10428PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10431 if (PyUnicode_READY(left) == -1 ||
10432 PyUnicode_READY(right) == -1)
10433 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010434 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010436 PyErr_Format(PyExc_TypeError,
10437 "Can't compare %.100s and %.100s",
10438 left->ob_type->tp_name,
10439 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440 return -1;
10441}
10442
Martin v. Löwis5b222132007-06-10 09:51:05 +000010443int
10444PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 Py_ssize_t i;
10447 int kind;
10448 void *data;
10449 Py_UCS4 chr;
10450
Victor Stinner910337b2011-10-03 03:20:16 +020010451 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (PyUnicode_READY(uni) == -1)
10453 return -1;
10454 kind = PyUnicode_KIND(uni);
10455 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010456 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10458 if (chr != str[i])
10459 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010460 /* This check keeps Python strings that end in '\0' from comparing equal
10461 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010463 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010464 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010465 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010466 return 0;
10467}
10468
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010469
Benjamin Peterson29060642009-01-31 22:14:21 +000010470#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010471 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010472
Alexander Belopolsky40018472011-02-26 01:02:56 +000010473PyObject *
10474PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010475{
10476 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010477
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010478 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10479 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 if (PyUnicode_READY(left) == -1 ||
10481 PyUnicode_READY(right) == -1)
10482 return NULL;
10483 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10484 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010485 if (op == Py_EQ) {
10486 Py_INCREF(Py_False);
10487 return Py_False;
10488 }
10489 if (op == Py_NE) {
10490 Py_INCREF(Py_True);
10491 return Py_True;
10492 }
10493 }
10494 if (left == right)
10495 result = 0;
10496 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010497 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010498
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010499 /* Convert the return value to a Boolean */
10500 switch (op) {
10501 case Py_EQ:
10502 v = TEST_COND(result == 0);
10503 break;
10504 case Py_NE:
10505 v = TEST_COND(result != 0);
10506 break;
10507 case Py_LE:
10508 v = TEST_COND(result <= 0);
10509 break;
10510 case Py_GE:
10511 v = TEST_COND(result >= 0);
10512 break;
10513 case Py_LT:
10514 v = TEST_COND(result == -1);
10515 break;
10516 case Py_GT:
10517 v = TEST_COND(result == 1);
10518 break;
10519 default:
10520 PyErr_BadArgument();
10521 return NULL;
10522 }
10523 Py_INCREF(v);
10524 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010525 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010526
Brian Curtindfc80e32011-08-10 20:28:54 -050010527 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010528}
10529
Alexander Belopolsky40018472011-02-26 01:02:56 +000010530int
10531PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010532{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010533 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 int kind1, kind2, kind;
10535 void *buf1, *buf2;
10536 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010537 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010538
10539 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010540 sub = PyUnicode_FromObject(element);
10541 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010542 PyErr_Format(PyExc_TypeError,
10543 "'in <string>' requires string as left operand, not %s",
10544 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010545 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010546 }
10547
Thomas Wouters477c8d52006-05-27 19:21:47 +000010548 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010549 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010550 Py_DECREF(sub);
10551 return -1;
10552 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010553 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10554 Py_DECREF(sub);
10555 Py_DECREF(str);
10556 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 kind1 = PyUnicode_KIND(str);
10559 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010560 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 buf1 = PyUnicode_DATA(str);
10562 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010563 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010564 if (kind2 > kind) {
10565 Py_DECREF(sub);
10566 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010567 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010568 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010569 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010570 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 if (!buf2) {
10572 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010573 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 return -1;
10575 }
10576 len1 = PyUnicode_GET_LENGTH(str);
10577 len2 = PyUnicode_GET_LENGTH(sub);
10578
Benjamin Petersonead6b532011-12-20 17:23:42 -060010579 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 case PyUnicode_1BYTE_KIND:
10581 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10582 break;
10583 case PyUnicode_2BYTE_KIND:
10584 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10585 break;
10586 case PyUnicode_4BYTE_KIND:
10587 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10588 break;
10589 default:
10590 result = -1;
10591 assert(0);
10592 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010593
10594 Py_DECREF(str);
10595 Py_DECREF(sub);
10596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 if (kind2 != kind)
10598 PyMem_Free(buf2);
10599
Guido van Rossum403d68b2000-03-13 15:55:09 +000010600 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010601}
10602
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603/* Concat to string or Unicode object giving a new Unicode object. */
10604
Alexander Belopolsky40018472011-02-26 01:02:56 +000010605PyObject *
10606PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010609 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010610 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
10612 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010615 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010618 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619
10620 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010621 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010622 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010625 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010626 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628 }
10629
Victor Stinner488fa492011-12-12 00:01:39 +010010630 u_len = PyUnicode_GET_LENGTH(u);
10631 v_len = PyUnicode_GET_LENGTH(v);
10632 if (u_len > PY_SSIZE_T_MAX - v_len) {
10633 PyErr_SetString(PyExc_OverflowError,
10634 "strings are too large to concat");
10635 goto onError;
10636 }
10637 new_len = u_len + v_len;
10638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010640 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010641 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010644 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010647 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10648 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649 Py_DECREF(u);
10650 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010651 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010653
Benjamin Peterson29060642009-01-31 22:14:21 +000010654 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655 Py_XDECREF(u);
10656 Py_XDECREF(v);
10657 return NULL;
10658}
10659
Walter Dörwald1ab83302007-05-18 17:15:44 +000010660void
Victor Stinner23e56682011-10-03 03:54:37 +020010661PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010662{
Victor Stinner23e56682011-10-03 03:54:37 +020010663 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010664 Py_UCS4 maxchar, maxchar2;
10665 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010666
10667 if (p_left == NULL) {
10668 if (!PyErr_Occurred())
10669 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010670 return;
10671 }
Victor Stinner23e56682011-10-03 03:54:37 +020010672 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010673 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010674 if (!PyErr_Occurred())
10675 PyErr_BadInternalCall();
10676 goto error;
10677 }
10678
Benjamin Petersonbac79492012-01-14 13:34:47 -050010679 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010680 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010681 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010682 goto error;
10683
Victor Stinner488fa492011-12-12 00:01:39 +010010684 /* Shortcuts */
10685 if (left == unicode_empty) {
10686 Py_DECREF(left);
10687 Py_INCREF(right);
10688 *p_left = right;
10689 return;
10690 }
10691 if (right == unicode_empty)
10692 return;
10693
10694 left_len = PyUnicode_GET_LENGTH(left);
10695 right_len = PyUnicode_GET_LENGTH(right);
10696 if (left_len > PY_SSIZE_T_MAX - right_len) {
10697 PyErr_SetString(PyExc_OverflowError,
10698 "strings are too large to concat");
10699 goto error;
10700 }
10701 new_len = left_len + right_len;
10702
10703 if (unicode_modifiable(left)
10704 && PyUnicode_CheckExact(right)
10705 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010706 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10707 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010708 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010709 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010710 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10711 {
10712 /* append inplace */
10713 if (unicode_resize(p_left, new_len) != 0) {
10714 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10715 * deallocated so it cannot be put back into
10716 * 'variable'. The MemoryError is raised when there
10717 * is no value in 'variable', which might (very
10718 * remotely) be a cause of incompatibilities.
10719 */
10720 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010721 }
Victor Stinner488fa492011-12-12 00:01:39 +010010722 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010723 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010724 }
Victor Stinner488fa492011-12-12 00:01:39 +010010725 else {
10726 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10727 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010728 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010729
Victor Stinner488fa492011-12-12 00:01:39 +010010730 /* Concat the two Unicode strings */
10731 res = PyUnicode_New(new_len, maxchar);
10732 if (res == NULL)
10733 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010734 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10735 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010736 Py_DECREF(left);
10737 *p_left = res;
10738 }
10739 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010740 return;
10741
10742error:
Victor Stinner488fa492011-12-12 00:01:39 +010010743 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010744}
10745
10746void
10747PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10748{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010749 PyUnicode_Append(pleft, right);
10750 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010751}
10752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010753PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010754 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010756Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010757string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010758interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759
10760static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010761unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010763 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010764 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010765 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010767 int kind1, kind2, kind;
10768 void *buf1, *buf2;
10769 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770
Jesus Ceaac451502011-04-20 17:09:23 +020010771 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10772 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010773 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 kind1 = PyUnicode_KIND(self);
10776 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010777 if (kind2 > kind1)
10778 return PyLong_FromLong(0);
10779 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 buf1 = PyUnicode_DATA(self);
10781 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010783 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 if (!buf2) {
10785 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 return NULL;
10787 }
10788 len1 = PyUnicode_GET_LENGTH(self);
10789 len2 = PyUnicode_GET_LENGTH(substring);
10790
10791 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010792 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 case PyUnicode_1BYTE_KIND:
10794 iresult = ucs1lib_count(
10795 ((Py_UCS1*)buf1) + start, end - start,
10796 buf2, len2, PY_SSIZE_T_MAX
10797 );
10798 break;
10799 case PyUnicode_2BYTE_KIND:
10800 iresult = ucs2lib_count(
10801 ((Py_UCS2*)buf1) + start, end - start,
10802 buf2, len2, PY_SSIZE_T_MAX
10803 );
10804 break;
10805 case PyUnicode_4BYTE_KIND:
10806 iresult = ucs4lib_count(
10807 ((Py_UCS4*)buf1) + start, end - start,
10808 buf2, len2, PY_SSIZE_T_MAX
10809 );
10810 break;
10811 default:
10812 assert(0); iresult = 0;
10813 }
10814
10815 result = PyLong_FromSsize_t(iresult);
10816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 if (kind2 != kind)
10818 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819
10820 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010821
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 return result;
10823}
10824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010825PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010826 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010828Encode S using the codec registered for encoding. Default encoding\n\
10829is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010830handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010831a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10832'xmlcharrefreplace' as well as any other name registered with\n\
10833codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834
10835static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010836unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010838 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839 char *encoding = NULL;
10840 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010841
Benjamin Peterson308d6372009-09-18 21:42:35 +000010842 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10843 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010845 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010846}
10847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010848PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850\n\
10851Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010852If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853
10854static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010855unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010857 Py_ssize_t i, j, line_pos, src_len, incr;
10858 Py_UCS4 ch;
10859 PyObject *u;
10860 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010862 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010863 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864
10865 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010866 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867
Antoine Pitrou22425222011-10-04 19:10:51 +020010868 if (PyUnicode_READY(self) == -1)
10869 return NULL;
10870
Thomas Wouters7e474022000-07-16 12:04:32 +000010871 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010872 src_len = PyUnicode_GET_LENGTH(self);
10873 i = j = line_pos = 0;
10874 kind = PyUnicode_KIND(self);
10875 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010876 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010877 for (; i < src_len; i++) {
10878 ch = PyUnicode_READ(kind, src_data, i);
10879 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010880 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010881 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010882 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010883 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010884 goto overflow;
10885 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010886 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010887 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010890 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010891 goto overflow;
10892 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010894 if (ch == '\n' || ch == '\r')
10895 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010897 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010898 if (!found)
10899 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010900
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010902 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903 if (!u)
10904 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010905 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906
Antoine Pitroue71d5742011-10-04 15:55:09 +020010907 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908
Antoine Pitroue71d5742011-10-04 15:55:09 +020010909 for (; i < src_len; i++) {
10910 ch = PyUnicode_READ(kind, src_data, i);
10911 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010912 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010913 incr = tabsize - (line_pos % tabsize);
10914 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010915 FILL(kind, dest_data, ' ', j, incr);
10916 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010918 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010919 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010920 line_pos++;
10921 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010922 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010923 if (ch == '\n' || ch == '\r')
10924 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010926 }
10927 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010928 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010929
Antoine Pitroue71d5742011-10-04 15:55:09 +020010930 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010931 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933}
10934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010935PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010936 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937\n\
10938Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010939such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940arguments start and end are interpreted as in slice notation.\n\
10941\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010942Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943
10944static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010947 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010948 Py_ssize_t start;
10949 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010950 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951
Jesus Ceaac451502011-04-20 17:09:23 +020010952 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10953 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 if (PyUnicode_READY(self) == -1)
10957 return NULL;
10958 if (PyUnicode_READY(substring) == -1)
10959 return NULL;
10960
Victor Stinner7931d9a2011-11-04 00:22:48 +010010961 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
10963 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 if (result == -2)
10966 return NULL;
10967
Christian Heimes217cfd12007-12-02 14:31:20 +000010968 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969}
10970
10971static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010972unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010974 void *data;
10975 enum PyUnicode_Kind kind;
10976 Py_UCS4 ch;
10977 PyObject *res;
10978
10979 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10980 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010982 }
10983 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10984 PyErr_SetString(PyExc_IndexError, "string index out of range");
10985 return NULL;
10986 }
10987 kind = PyUnicode_KIND(self);
10988 data = PyUnicode_DATA(self);
10989 ch = PyUnicode_READ(kind, data, index);
10990 if (ch < 256)
10991 return get_latin1_char(ch);
10992
10993 res = PyUnicode_New(1, ch);
10994 if (res == NULL)
10995 return NULL;
10996 kind = PyUnicode_KIND(res);
10997 data = PyUnicode_DATA(res);
10998 PyUnicode_WRITE(kind, data, 0, ch);
10999 assert(_PyUnicode_CheckConsistency(res, 1));
11000 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001}
11002
Guido van Rossumc2504932007-09-18 19:42:40 +000011003/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011004 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011005static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011006unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007{
Guido van Rossumc2504932007-09-18 19:42:40 +000011008 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011009 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011010
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011011#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011012 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011013#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 if (_PyUnicode_HASH(self) != -1)
11015 return _PyUnicode_HASH(self);
11016 if (PyUnicode_READY(self) == -1)
11017 return -1;
11018 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011019 /*
11020 We make the hash of the empty string be 0, rather than using
11021 (prefix ^ suffix), since this slightly obfuscates the hash secret
11022 */
11023 if (len == 0) {
11024 _PyUnicode_HASH(self) = 0;
11025 return 0;
11026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027
11028 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011029#define HASH(P) \
11030 x ^= (Py_uhash_t) *P << 7; \
11031 while (--len >= 0) \
11032 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033
Georg Brandl2fb477c2012-02-21 00:33:36 +010011034 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 switch (PyUnicode_KIND(self)) {
11036 case PyUnicode_1BYTE_KIND: {
11037 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11038 HASH(c);
11039 break;
11040 }
11041 case PyUnicode_2BYTE_KIND: {
11042 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11043 HASH(s);
11044 break;
11045 }
11046 default: {
11047 Py_UCS4 *l;
11048 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11049 "Impossible switch case in unicode_hash");
11050 l = PyUnicode_4BYTE_DATA(self);
11051 HASH(l);
11052 break;
11053 }
11054 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011055 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11056 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057
Guido van Rossumc2504932007-09-18 19:42:40 +000011058 if (x == -1)
11059 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011061 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011065PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011066 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011068Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
11070static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011073 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011074 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011075 Py_ssize_t start;
11076 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077
Jesus Ceaac451502011-04-20 17:09:23 +020011078 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11079 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 if (PyUnicode_READY(self) == -1)
11083 return NULL;
11084 if (PyUnicode_READY(substring) == -1)
11085 return NULL;
11086
Victor Stinner7931d9a2011-11-04 00:22:48 +010011087 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088
11089 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 if (result == -2)
11092 return NULL;
11093
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094 if (result < 0) {
11095 PyErr_SetString(PyExc_ValueError, "substring not found");
11096 return NULL;
11097 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011098
Christian Heimes217cfd12007-12-02 14:31:20 +000011099 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100}
11101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011102PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011103 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011105Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011106at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107
11108static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011109unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 Py_ssize_t i, length;
11112 int kind;
11113 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114 int cased;
11115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 if (PyUnicode_READY(self) == -1)
11117 return NULL;
11118 length = PyUnicode_GET_LENGTH(self);
11119 kind = PyUnicode_KIND(self);
11120 data = PyUnicode_DATA(self);
11121
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 if (length == 1)
11124 return PyBool_FromLong(
11125 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011127 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011128 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011129 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011130
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 for (i = 0; i < length; i++) {
11133 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011134
Benjamin Peterson29060642009-01-31 22:14:21 +000011135 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11136 return PyBool_FromLong(0);
11137 else if (!cased && Py_UNICODE_ISLOWER(ch))
11138 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011140 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141}
11142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011143PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011146Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011147at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148
11149static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011150unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 Py_ssize_t i, length;
11153 int kind;
11154 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155 int cased;
11156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 if (PyUnicode_READY(self) == -1)
11158 return NULL;
11159 length = PyUnicode_GET_LENGTH(self);
11160 kind = PyUnicode_KIND(self);
11161 data = PyUnicode_DATA(self);
11162
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 if (length == 1)
11165 return PyBool_FromLong(
11166 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011168 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011170 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011171
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 for (i = 0; i < length; i++) {
11174 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011175
Benjamin Peterson29060642009-01-31 22:14:21 +000011176 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11177 return PyBool_FromLong(0);
11178 else if (!cased && Py_UNICODE_ISUPPER(ch))
11179 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011181 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182}
11183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011184PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011187Return True if S is a titlecased string and there is at least one\n\
11188character in S, i.e. upper- and titlecase characters may only\n\
11189follow uncased characters and lowercase characters only cased ones.\n\
11190Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191
11192static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011193unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 Py_ssize_t i, length;
11196 int kind;
11197 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198 int cased, previous_is_cased;
11199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 if (PyUnicode_READY(self) == -1)
11201 return NULL;
11202 length = PyUnicode_GET_LENGTH(self);
11203 kind = PyUnicode_KIND(self);
11204 data = PyUnicode_DATA(self);
11205
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 if (length == 1) {
11208 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11209 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11210 (Py_UNICODE_ISUPPER(ch) != 0));
11211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011213 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011215 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011216
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217 cased = 0;
11218 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 for (i = 0; i < length; i++) {
11220 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011221
Benjamin Peterson29060642009-01-31 22:14:21 +000011222 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11223 if (previous_is_cased)
11224 return PyBool_FromLong(0);
11225 previous_is_cased = 1;
11226 cased = 1;
11227 }
11228 else if (Py_UNICODE_ISLOWER(ch)) {
11229 if (!previous_is_cased)
11230 return PyBool_FromLong(0);
11231 previous_is_cased = 1;
11232 cased = 1;
11233 }
11234 else
11235 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011237 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238}
11239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011240PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011241 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011243Return True if all characters in S are whitespace\n\
11244and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245
11246static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011247unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 Py_ssize_t i, length;
11250 int kind;
11251 void *data;
11252
11253 if (PyUnicode_READY(self) == -1)
11254 return NULL;
11255 length = PyUnicode_GET_LENGTH(self);
11256 kind = PyUnicode_KIND(self);
11257 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 if (length == 1)
11261 return PyBool_FromLong(
11262 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011264 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 for (i = 0; i < length; i++) {
11269 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011270 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011273 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274}
11275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011276PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011277 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011278\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011279Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011280and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011281
11282static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011283unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 Py_ssize_t i, length;
11286 int kind;
11287 void *data;
11288
11289 if (PyUnicode_READY(self) == -1)
11290 return NULL;
11291 length = PyUnicode_GET_LENGTH(self);
11292 kind = PyUnicode_KIND(self);
11293 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011294
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011295 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 if (length == 1)
11297 return PyBool_FromLong(
11298 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011299
11300 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 for (i = 0; i < length; i++) {
11305 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011306 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011307 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011308 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011309}
11310
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011311PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011312 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011313\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011314Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011315and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011316
11317static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011318unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 int kind;
11321 void *data;
11322 Py_ssize_t len, i;
11323
11324 if (PyUnicode_READY(self) == -1)
11325 return NULL;
11326
11327 kind = PyUnicode_KIND(self);
11328 data = PyUnicode_DATA(self);
11329 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011330
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011331 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 if (len == 1) {
11333 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11334 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11335 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011336
11337 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011339 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 for (i = 0; i < len; i++) {
11342 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011343 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011345 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011346 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011347}
11348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011349PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011352Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011353False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354
11355static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011356unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 Py_ssize_t i, length;
11359 int kind;
11360 void *data;
11361
11362 if (PyUnicode_READY(self) == -1)
11363 return NULL;
11364 length = PyUnicode_GET_LENGTH(self);
11365 kind = PyUnicode_KIND(self);
11366 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 if (length == 1)
11370 return PyBool_FromLong(
11371 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011373 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 for (i = 0; i < length; i++) {
11378 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011381 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382}
11383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011384PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011387Return True if all characters in S are digits\n\
11388and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389
11390static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011391unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 Py_ssize_t i, length;
11394 int kind;
11395 void *data;
11396
11397 if (PyUnicode_READY(self) == -1)
11398 return NULL;
11399 length = PyUnicode_GET_LENGTH(self);
11400 kind = PyUnicode_KIND(self);
11401 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 if (length == 1) {
11405 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11406 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011409 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 for (i = 0; i < length; i++) {
11414 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011417 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418}
11419
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011420PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011421 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011423Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011424False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
11426static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011427unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 Py_ssize_t i, length;
11430 int kind;
11431 void *data;
11432
11433 if (PyUnicode_READY(self) == -1)
11434 return NULL;
11435 length = PyUnicode_GET_LENGTH(self);
11436 kind = PyUnicode_KIND(self);
11437 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 if (length == 1)
11441 return PyBool_FromLong(
11442 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011444 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 for (i = 0; i < length; i++) {
11449 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011450 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011452 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453}
11454
Martin v. Löwis47383402007-08-15 07:32:56 +000011455int
11456PyUnicode_IsIdentifier(PyObject *self)
11457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 int kind;
11459 void *data;
11460 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011461 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (PyUnicode_READY(self) == -1) {
11464 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011465 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 }
11467
11468 /* Special case for empty strings */
11469 if (PyUnicode_GET_LENGTH(self) == 0)
11470 return 0;
11471 kind = PyUnicode_KIND(self);
11472 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011473
11474 /* PEP 3131 says that the first character must be in
11475 XID_Start and subsequent characters in XID_Continue,
11476 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011477 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011478 letters, digits, underscore). However, given the current
11479 definition of XID_Start and XID_Continue, it is sufficient
11480 to check just for these, except that _ must be allowed
11481 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011483 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011484 return 0;
11485
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011486 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011489 return 1;
11490}
11491
11492PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011494\n\
11495Return True if S is a valid identifier according\n\
11496to the language definition.");
11497
11498static PyObject*
11499unicode_isidentifier(PyObject *self)
11500{
11501 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11502}
11503
Georg Brandl559e5d72008-06-11 18:37:52 +000011504PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011506\n\
11507Return True if all characters in S are considered\n\
11508printable in repr() or S is empty, False otherwise.");
11509
11510static PyObject*
11511unicode_isprintable(PyObject *self)
11512{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 Py_ssize_t i, length;
11514 int kind;
11515 void *data;
11516
11517 if (PyUnicode_READY(self) == -1)
11518 return NULL;
11519 length = PyUnicode_GET_LENGTH(self);
11520 kind = PyUnicode_KIND(self);
11521 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011522
11523 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 if (length == 1)
11525 return PyBool_FromLong(
11526 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 for (i = 0; i < length; i++) {
11529 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011530 Py_RETURN_FALSE;
11531 }
11532 }
11533 Py_RETURN_TRUE;
11534}
11535
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011536PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011537 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538\n\
11539Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011540iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541
11542static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011543unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011545 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546}
11547
Martin v. Löwis18e16552006-02-15 17:27:45 +000011548static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011549unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (PyUnicode_READY(self) == -1)
11552 return -1;
11553 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554}
11555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011556PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011559Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011560done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561
11562static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011563unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011565 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 Py_UCS4 fillchar = ' ';
11567
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011568 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569 return NULL;
11570
Benjamin Petersonbac79492012-01-14 13:34:47 -050011571 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573
Victor Stinnerc4b49542011-12-11 22:44:26 +010011574 if (PyUnicode_GET_LENGTH(self) >= width)
11575 return unicode_result_unchanged(self);
11576
11577 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578}
11579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011580PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011583Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584
11585static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011586unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011588 if (PyUnicode_READY(self) == -1)
11589 return NULL;
11590 if (PyUnicode_IS_ASCII(self))
11591 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011592 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593}
11594
/* Strip modes.  The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode.  Both the strings
   and the table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11603
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011604/* externally visible for str.strip(unicode) */
11605PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011606_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 void *data;
11609 int kind;
11610 Py_ssize_t i, j, len;
11611 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11614 return NULL;
11615
11616 kind = PyUnicode_KIND(self);
11617 data = PyUnicode_DATA(self);
11618 len = PyUnicode_GET_LENGTH(self);
11619 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11620 PyUnicode_DATA(sepobj),
11621 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011622
Benjamin Peterson14339b62009-01-31 16:36:08 +000011623 i = 0;
11624 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 while (i < len &&
11626 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 i++;
11628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011629 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011630
Benjamin Peterson14339b62009-01-31 16:36:08 +000011631 j = len;
11632 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 do {
11634 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 } while (j >= i &&
11636 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011638 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011639
Victor Stinner7931d9a2011-11-04 00:22:48 +010011640 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641}
11642
11643PyObject*
11644PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11645{
11646 unsigned char *data;
11647 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011648 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649
Victor Stinnerde636f32011-10-01 03:55:54 +020011650 if (PyUnicode_READY(self) == -1)
11651 return NULL;
11652
Victor Stinner684d5fd2012-05-03 02:32:34 +020011653 length = PyUnicode_GET_LENGTH(self);
11654 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011655
Victor Stinner684d5fd2012-05-03 02:32:34 +020011656 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011657 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658
Victor Stinnerde636f32011-10-01 03:55:54 +020011659 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011660 PyErr_SetString(PyExc_IndexError, "string index out of range");
11661 return NULL;
11662 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011663 if (start >= length || end < start)
11664 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011665
Victor Stinner684d5fd2012-05-03 02:32:34 +020011666 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011667 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011668 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011669 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011670 }
11671 else {
11672 kind = PyUnicode_KIND(self);
11673 data = PyUnicode_1BYTE_DATA(self);
11674 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011675 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011676 length);
11677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679
11680static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011681do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 int kind;
11684 void *data;
11685 Py_ssize_t len, i, j;
11686
11687 if (PyUnicode_READY(self) == -1)
11688 return NULL;
11689
11690 kind = PyUnicode_KIND(self);
11691 data = PyUnicode_DATA(self);
11692 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011693
Benjamin Peterson14339b62009-01-31 16:36:08 +000011694 i = 0;
11695 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011697 i++;
11698 }
11699 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011700
Benjamin Peterson14339b62009-01-31 16:36:08 +000011701 j = len;
11702 if (striptype != LEFTSTRIP) {
11703 do {
11704 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011706 j++;
11707 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011708
Victor Stinner7931d9a2011-11-04 00:22:48 +010011709 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710}
11711
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011712
11713static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011714do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011715{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011716 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011717
Benjamin Peterson14339b62009-01-31 16:36:08 +000011718 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11719 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011720
Benjamin Peterson14339b62009-01-31 16:36:08 +000011721 if (sep != NULL && sep != Py_None) {
11722 if (PyUnicode_Check(sep))
11723 return _PyUnicode_XStrip(self, striptype, sep);
11724 else {
11725 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 "%s arg must be None or str",
11727 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011728 return NULL;
11729 }
11730 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011731
Benjamin Peterson14339b62009-01-31 16:36:08 +000011732 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011733}
11734
11735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011736PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011737 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011738\n\
11739Return a copy of the string S with leading and trailing\n\
11740whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011741If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011742
11743static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011744unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011745{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011746 if (PyTuple_GET_SIZE(args) == 0)
11747 return do_strip(self, BOTHSTRIP); /* Common case */
11748 else
11749 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011750}
11751
11752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011753PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011754 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011755\n\
11756Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011757If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011758
11759static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011760unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011761{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011762 if (PyTuple_GET_SIZE(args) == 0)
11763 return do_strip(self, LEFTSTRIP); /* Common case */
11764 else
11765 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011766}
11767
11768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011769PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011770 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011771\n\
11772Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011773If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011774
11775static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011776unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011777{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011778 if (PyTuple_GET_SIZE(args) == 0)
11779 return do_strip(self, RIGHTSTRIP); /* Common case */
11780 else
11781 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011782}
11783
11784
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011786unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011788 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790
Serhiy Storchaka05997252013-01-26 12:14:02 +020011791 if (len < 1)
11792 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793
Victor Stinnerc4b49542011-12-11 22:44:26 +010011794 /* no repeat, return original string */
11795 if (len == 1)
11796 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011797
Benjamin Petersonbac79492012-01-14 13:34:47 -050011798 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 return NULL;
11800
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011801 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011802 PyErr_SetString(PyExc_OverflowError,
11803 "repeated string is too long");
11804 return NULL;
11805 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011807
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011808 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809 if (!u)
11810 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011811 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 if (PyUnicode_GET_LENGTH(str) == 1) {
11814 const int kind = PyUnicode_KIND(str);
11815 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011816 if (kind == PyUnicode_1BYTE_KIND) {
11817 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011818 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011819 }
11820 else if (kind == PyUnicode_2BYTE_KIND) {
11821 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011822 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011823 ucs2[n] = fill_char;
11824 } else {
11825 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11826 assert(kind == PyUnicode_4BYTE_KIND);
11827 for (n = 0; n < len; ++n)
11828 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011829 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 }
11831 else {
11832 /* number of characters copied this far */
11833 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011834 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 char *to = (char *) PyUnicode_DATA(u);
11836 Py_MEMCPY(to, PyUnicode_DATA(str),
11837 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011839 n = (done <= nchars-done) ? done : nchars-done;
11840 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011841 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 }
11844
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011845 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011846 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847}
11848
Alexander Belopolsky40018472011-02-26 01:02:56 +000011849PyObject *
11850PyUnicode_Replace(PyObject *obj,
11851 PyObject *subobj,
11852 PyObject *replobj,
11853 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854{
11855 PyObject *self;
11856 PyObject *str1;
11857 PyObject *str2;
11858 PyObject *result;
11859
11860 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011861 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011864 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 Py_DECREF(self);
11866 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867 }
11868 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011869 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 Py_DECREF(self);
11871 Py_DECREF(str1);
11872 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011874 if (PyUnicode_READY(self) == -1 ||
11875 PyUnicode_READY(str1) == -1 ||
11876 PyUnicode_READY(str2) == -1)
11877 result = NULL;
11878 else
11879 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880 Py_DECREF(self);
11881 Py_DECREF(str1);
11882 Py_DECREF(str2);
11883 return result;
11884}
11885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011886PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011887 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888\n\
11889Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011890old replaced by new. If the optional argument count is\n\
11891given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892
11893static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 PyObject *str1;
11897 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011898 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899 PyObject *result;
11900
Martin v. Löwis18e16552006-02-15 17:27:45 +000011901 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011903 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011906 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 return NULL;
11908 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011909 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011910 Py_DECREF(str1);
11911 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011912 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011913 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11914 result = NULL;
11915 else
11916 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917
11918 Py_DECREF(str1);
11919 Py_DECREF(str2);
11920 return result;
11921}
11922
/* Build the repr() of a string: the text wrapped in quotes, with quotes,
   backslashes and non-printable characters escaped.

   Works in two passes.  The first pass computes the exact output length
   (osize), counts single and double quotes so the quote character can be
   chosen, and tracks the largest character that will be copied verbatim
   (max) so the result can be allocated with the right width.  The second
   pass writes the output.  The size added per character in the first pass
   must match what the second pass writes for that character.

   Returns a new string, or NULL if the input cannot be made ready or
   PyUnicode_New() fails. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, squote, dquote, i, o;
    Py_UCS4 max, quote;
    int ikind, okind;
    void *idata, *odata;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    osize = 2; /* quotes */
    max = 127;
    squote = dquote = 0;
    ikind = PyUnicode_KIND(unicode);
    for (i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        switch (ch) {
        /* Quotes are counted unescaped here; the escape for the chosen
           quote character is added after the loop. */
        case '\'': squote++; osize++; break;
        case '"':  dquote++; osize++; break;
        case '\\': case '\t': case '\r': case '\n':
            osize += 2; break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                osize += 4; /* \xHH */
            else if (ch < 0x7f)
                osize++;
            else if (Py_UNICODE_ISPRINTABLE(ch)) {
                /* Copied verbatim, so it can widen the output. */
                osize++;
                max = ch > max ? ch : max;
            }
            else if (ch < 0x100)
                osize += 4; /* \xHH */
            else if (ch < 0x10000)
                osize += 6; /* \uHHHH */
            else
                osize += 10; /* \UHHHHHHHH */
        }
    }

    /* Default to single quotes; switch to double quotes only when the text
       contains single quotes and no double quotes, which avoids escaping. */
    quote = '\'';
    if (squote) {
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            osize += squote;
        else
            quote = '"';
    }

    repr = PyUnicode_New(osize, max);
    if (repr == NULL)
        return NULL;
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    /* Opening and closing quote are both written up front. */
    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);

    /* o is the next output index; it starts after the opening quote. */
    for (i = 0, o = 1; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

        /* Escape quotes and backslashes */
        if ((ch == quote) || (ch == '\\')) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, ch);
            continue;
        }

        /* Map special whitespace to '\t', '\n', '\r' */
        if (ch == '\t') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 't');
        }
        else if (ch == '\n') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'n');
        }
        else if (ch == '\r') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'r');
        }

        /* Map non-printable US ASCII to '\xhh' */
        else if (ch < ' ' || ch == 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'x');
            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
        }

        /* Copy ASCII characters as-is */
        else if (ch < 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, ch);
        }

        /* Non-ASCII characters */
        else {
            /* Map Unicode whitespace and control characters
               (categories Z* and C* except ASCII space)
            */
            if (!Py_UNICODE_ISPRINTABLE(ch)) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                /* Map 8-bit characters to '\xhh' */
                if (ch <= 0xff) {
                    PyUnicode_WRITE(okind, odata, o++, 'x');
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
                }
                /* Map 16-bit characters to '\uxxxx' */
                else if (ch <= 0xffff) {
                    PyUnicode_WRITE(okind, odata, o++, 'u');
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                }
                /* Map 21-bit characters to '\U00xxxxxx' */
                else {
                    PyUnicode_WRITE(okind, odata, o++, 'U');
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                }
            }
            /* Copy characters as-is */
            else {
                PyUnicode_WRITE(okind, odata, o++, ch);
            }
        }
    }
    /* Closing quote already added at the beginning */
    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
12071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012072PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012073 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074\n\
12075Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012076such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077arguments start and end are interpreted as in slice notation.\n\
12078\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012079Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
12081static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012084 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012085 Py_ssize_t start;
12086 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012087 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
Jesus Ceaac451502011-04-20 17:09:23 +020012089 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12090 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 if (PyUnicode_READY(self) == -1)
12094 return NULL;
12095 if (PyUnicode_READY(substring) == -1)
12096 return NULL;
12097
Victor Stinner7931d9a2011-11-04 00:22:48 +010012098 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099
12100 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 if (result == -2)
12103 return NULL;
12104
Christian Heimes217cfd12007-12-02 14:31:20 +000012105 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106}
12107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012108PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012111Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112
12113static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012116 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012117 Py_ssize_t start;
12118 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012119 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120
Jesus Ceaac451502011-04-20 17:09:23 +020012121 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12122 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012123 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 if (PyUnicode_READY(self) == -1)
12126 return NULL;
12127 if (PyUnicode_READY(substring) == -1)
12128 return NULL;
12129
Victor Stinner7931d9a2011-11-04 00:22:48 +010012130 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131
12132 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 if (result == -2)
12135 return NULL;
12136
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137 if (result < 0) {
12138 PyErr_SetString(PyExc_ValueError, "substring not found");
12139 return NULL;
12140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141
Christian Heimes217cfd12007-12-02 14:31:20 +000012142 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143}
12144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012145PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012146 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012148Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012149done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150
12151static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012152unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012154 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 Py_UCS4 fillchar = ' ';
12156
Victor Stinnere9a29352011-10-01 02:14:59 +020012157 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012159
Benjamin Petersonbac79492012-01-14 13:34:47 -050012160 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161 return NULL;
12162
Victor Stinnerc4b49542011-12-11 22:44:26 +010012163 if (PyUnicode_GET_LENGTH(self) >= width)
12164 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165
Victor Stinnerc4b49542011-12-11 22:44:26 +010012166 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167}
12168
Alexander Belopolsky40018472011-02-26 01:02:56 +000012169PyObject *
12170PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171{
12172 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012173
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174 s = PyUnicode_FromObject(s);
12175 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012176 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012177 if (sep != NULL) {
12178 sep = PyUnicode_FromObject(sep);
12179 if (sep == NULL) {
12180 Py_DECREF(s);
12181 return NULL;
12182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183 }
12184
Victor Stinner9310abb2011-10-05 00:59:23 +020012185 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186
12187 Py_DECREF(s);
12188 Py_XDECREF(sep);
12189 return result;
12190}
12191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012192PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012193 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194\n\
12195Return a list of the words in S, using sep as the\n\
12196delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012197splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012198whitespace string is a separator and empty strings are\n\
12199removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200
12201static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012202unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012204 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012206 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012208 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12209 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210 return NULL;
12211
12212 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012213 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012215 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012217 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218}
12219
Thomas Wouters477c8d52006-05-27 19:21:47 +000012220PyObject *
12221PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12222{
12223 PyObject* str_obj;
12224 PyObject* sep_obj;
12225 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 int kind1, kind2, kind;
12227 void *buf1 = NULL, *buf2 = NULL;
12228 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012229
12230 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012231 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012232 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012233 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012234 if (!sep_obj) {
12235 Py_DECREF(str_obj);
12236 return NULL;
12237 }
12238 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12239 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012240 Py_DECREF(str_obj);
12241 return NULL;
12242 }
12243
Victor Stinner14f8f022011-10-05 20:58:25 +020012244 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012246 kind = Py_MAX(kind1, kind2);
12247 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012249 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250 if (!buf1)
12251 goto onError;
12252 buf2 = PyUnicode_DATA(sep_obj);
12253 if (kind2 != kind)
12254 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12255 if (!buf2)
12256 goto onError;
12257 len1 = PyUnicode_GET_LENGTH(str_obj);
12258 len2 = PyUnicode_GET_LENGTH(sep_obj);
12259
Benjamin Petersonead6b532011-12-20 17:23:42 -060012260 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012262 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12263 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12264 else
12265 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 break;
12267 case PyUnicode_2BYTE_KIND:
12268 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12269 break;
12270 case PyUnicode_4BYTE_KIND:
12271 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12272 break;
12273 default:
12274 assert(0);
12275 out = 0;
12276 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012277
12278 Py_DECREF(sep_obj);
12279 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 if (kind1 != kind)
12281 PyMem_Free(buf1);
12282 if (kind2 != kind)
12283 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012284
12285 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 onError:
12287 Py_DECREF(sep_obj);
12288 Py_DECREF(str_obj);
12289 if (kind1 != kind && buf1)
12290 PyMem_Free(buf1);
12291 if (kind2 != kind && buf2)
12292 PyMem_Free(buf2);
12293 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012294}
12295
12296
12297PyObject *
12298PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12299{
12300 PyObject* str_obj;
12301 PyObject* sep_obj;
12302 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 int kind1, kind2, kind;
12304 void *buf1 = NULL, *buf2 = NULL;
12305 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012306
12307 str_obj = PyUnicode_FromObject(str_in);
12308 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012310 sep_obj = PyUnicode_FromObject(sep_in);
12311 if (!sep_obj) {
12312 Py_DECREF(str_obj);
12313 return NULL;
12314 }
12315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 kind1 = PyUnicode_KIND(str_in);
12317 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012318 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 buf1 = PyUnicode_DATA(str_in);
12320 if (kind1 != kind)
12321 buf1 = _PyUnicode_AsKind(str_in, kind);
12322 if (!buf1)
12323 goto onError;
12324 buf2 = PyUnicode_DATA(sep_obj);
12325 if (kind2 != kind)
12326 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12327 if (!buf2)
12328 goto onError;
12329 len1 = PyUnicode_GET_LENGTH(str_obj);
12330 len2 = PyUnicode_GET_LENGTH(sep_obj);
12331
Benjamin Petersonead6b532011-12-20 17:23:42 -060012332 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012334 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12335 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12336 else
12337 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 break;
12339 case PyUnicode_2BYTE_KIND:
12340 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12341 break;
12342 case PyUnicode_4BYTE_KIND:
12343 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12344 break;
12345 default:
12346 assert(0);
12347 out = 0;
12348 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012349
12350 Py_DECREF(sep_obj);
12351 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 if (kind1 != kind)
12353 PyMem_Free(buf1);
12354 if (kind2 != kind)
12355 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012356
12357 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 onError:
12359 Py_DECREF(sep_obj);
12360 Py_DECREF(str_obj);
12361 if (kind1 != kind && buf1)
12362 PyMem_Free(buf1);
12363 if (kind2 != kind && buf2)
12364 PyMem_Free(buf2);
12365 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012366}
12367
12368PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012369 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012370\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012371Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012372the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012373found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012374
12375static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012376unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012377{
Victor Stinner9310abb2011-10-05 00:59:23 +020012378 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012379}
12380
12381PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012382 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012383\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012384Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012385the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012386separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012387
12388static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012389unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012390{
Victor Stinner9310abb2011-10-05 00:59:23 +020012391 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012392}
12393
Alexander Belopolsky40018472011-02-26 01:02:56 +000012394PyObject *
12395PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012396{
12397 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012398
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012399 s = PyUnicode_FromObject(s);
12400 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012401 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012402 if (sep != NULL) {
12403 sep = PyUnicode_FromObject(sep);
12404 if (sep == NULL) {
12405 Py_DECREF(s);
12406 return NULL;
12407 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012408 }
12409
Victor Stinner9310abb2011-10-05 00:59:23 +020012410 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012411
12412 Py_DECREF(s);
12413 Py_XDECREF(sep);
12414 return result;
12415}
12416
12417PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012418 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012419\n\
12420Return a list of the words in S, using sep as the\n\
12421delimiter string, starting at the end of the string and\n\
12422working to the front. If maxsplit is given, at most maxsplit\n\
12423splits are done. If sep is not specified, any whitespace string\n\
12424is a separator.");
12425
12426static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012427unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012428{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012429 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012430 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012431 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012432
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012433 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12434 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012435 return NULL;
12436
12437 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012438 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012439 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012440 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012441 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012442 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012443}
12444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012445PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012446 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447\n\
12448Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012449Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012450is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451
12452static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012453unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012455 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012456 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012458 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12459 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012460 return NULL;
12461
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012462 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463}
12464
12465static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012466PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012468 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469}
12470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012471PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012472 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473\n\
12474Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012475and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476
12477static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012478unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012480 if (PyUnicode_READY(self) == -1)
12481 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012482 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483}
12484
Georg Brandlceee0772007-11-27 23:48:05 +000012485PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012486 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012487\n\
12488Return a translation table usable for str.translate().\n\
12489If there is only one argument, it must be a dictionary mapping Unicode\n\
12490ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012491Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012492If there are two arguments, they must be strings of equal length, and\n\
12493in the resulting dictionary, each character in x will be mapped to the\n\
12494character at the same position in y. If there is a third argument, it\n\
12495must be a string, whose characters will be mapped to None in the result.");
12496
12497static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012498unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012499{
12500 PyObject *x, *y = NULL, *z = NULL;
12501 PyObject *new = NULL, *key, *value;
12502 Py_ssize_t i = 0;
12503 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012504
Georg Brandlceee0772007-11-27 23:48:05 +000012505 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12506 return NULL;
12507 new = PyDict_New();
12508 if (!new)
12509 return NULL;
12510 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 int x_kind, y_kind, z_kind;
12512 void *x_data, *y_data, *z_data;
12513
Georg Brandlceee0772007-11-27 23:48:05 +000012514 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012515 if (!PyUnicode_Check(x)) {
12516 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12517 "be a string if there is a second argument");
12518 goto err;
12519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012521 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12522 "arguments must have equal length");
12523 goto err;
12524 }
12525 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 x_kind = PyUnicode_KIND(x);
12527 y_kind = PyUnicode_KIND(y);
12528 x_data = PyUnicode_DATA(x);
12529 y_data = PyUnicode_DATA(y);
12530 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12531 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012532 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012533 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012534 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012535 if (!value) {
12536 Py_DECREF(key);
12537 goto err;
12538 }
Georg Brandlceee0772007-11-27 23:48:05 +000012539 res = PyDict_SetItem(new, key, value);
12540 Py_DECREF(key);
12541 Py_DECREF(value);
12542 if (res < 0)
12543 goto err;
12544 }
12545 /* create entries for deleting chars in z */
12546 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 z_kind = PyUnicode_KIND(z);
12548 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012549 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012551 if (!key)
12552 goto err;
12553 res = PyDict_SetItem(new, key, Py_None);
12554 Py_DECREF(key);
12555 if (res < 0)
12556 goto err;
12557 }
12558 }
12559 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560 int kind;
12561 void *data;
12562
Georg Brandlceee0772007-11-27 23:48:05 +000012563 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012564 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012565 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12566 "to maketrans it must be a dict");
12567 goto err;
12568 }
12569 /* copy entries into the new dict, converting string keys to int keys */
12570 while (PyDict_Next(x, &i, &key, &value)) {
12571 if (PyUnicode_Check(key)) {
12572 /* convert string keys to integer keys */
12573 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012574 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012575 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12576 "table must be of length 1");
12577 goto err;
12578 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 kind = PyUnicode_KIND(key);
12580 data = PyUnicode_DATA(key);
12581 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012582 if (!newkey)
12583 goto err;
12584 res = PyDict_SetItem(new, newkey, value);
12585 Py_DECREF(newkey);
12586 if (res < 0)
12587 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012588 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012589 /* just keep integer keys */
12590 if (PyDict_SetItem(new, key, value) < 0)
12591 goto err;
12592 } else {
12593 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12594 "be strings or integers");
12595 goto err;
12596 }
12597 }
12598 }
12599 return new;
12600 err:
12601 Py_DECREF(new);
12602 return NULL;
12603}
12604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012605PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012606 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607\n\
12608Return a copy of the string S, where all characters have been mapped\n\
12609through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012610Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012611Unmapped characters are left untouched. Characters mapped to None\n\
12612are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613
12614static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618}
12619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012620PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012621 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012623Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624
12625static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012626unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012628 if (PyUnicode_READY(self) == -1)
12629 return NULL;
12630 if (PyUnicode_IS_ASCII(self))
12631 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012632 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633}
12634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012635PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012636 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012638Pad a numeric string S with zeros on the left, to fill a field\n\
12639of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640
12641static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012642unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012644 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012645 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012646 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 int kind;
12648 void *data;
12649 Py_UCS4 chr;
12650
Martin v. Löwis18e16552006-02-15 17:27:45 +000012651 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652 return NULL;
12653
Benjamin Petersonbac79492012-01-14 13:34:47 -050012654 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656
Victor Stinnerc4b49542011-12-11 22:44:26 +010012657 if (PyUnicode_GET_LENGTH(self) >= width)
12658 return unicode_result_unchanged(self);
12659
12660 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661
12662 u = pad(self, fill, 0, '0');
12663
Walter Dörwald068325e2002-04-15 13:36:47 +000012664 if (u == NULL)
12665 return NULL;
12666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 kind = PyUnicode_KIND(u);
12668 data = PyUnicode_DATA(u);
12669 chr = PyUnicode_READ(kind, data, fill);
12670
12671 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 PyUnicode_WRITE(kind, data, 0, chr);
12674 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675 }
12676
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012677 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012678 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680
#if 0
/* Compiled out.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns the result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012689PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012690 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012692Return True if S starts with the specified prefix, False otherwise.\n\
12693With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012694With optional end, stop comparing S at that position.\n\
12695prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696
12697static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012698unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012699 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012701 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012702 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012703 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012704 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012705 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706
Jesus Ceaac451502011-04-20 17:09:23 +020012707 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012708 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012709 if (PyTuple_Check(subobj)) {
12710 Py_ssize_t i;
12711 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012712 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012713 if (substring == NULL)
12714 return NULL;
12715 result = tailmatch(self, substring, start, end, -1);
12716 Py_DECREF(substring);
12717 if (result) {
12718 Py_RETURN_TRUE;
12719 }
12720 }
12721 /* nothing matched */
12722 Py_RETURN_FALSE;
12723 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012724 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012725 if (substring == NULL) {
12726 if (PyErr_ExceptionMatches(PyExc_TypeError))
12727 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12728 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012729 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012730 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012731 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012733 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734}
12735
12736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012737PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012738 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012740Return True if S ends with the specified suffix, False otherwise.\n\
12741With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012742With optional end, stop comparing S at that position.\n\
12743suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744
12745static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012746unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012747 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012748{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012749 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012750 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012751 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012752 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012753 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012754
Jesus Ceaac451502011-04-20 17:09:23 +020012755 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012756 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012757 if (PyTuple_Check(subobj)) {
12758 Py_ssize_t i;
12759 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012760 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012761 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012762 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012763 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012764 result = tailmatch(self, substring, start, end, +1);
12765 Py_DECREF(substring);
12766 if (result) {
12767 Py_RETURN_TRUE;
12768 }
12769 }
12770 Py_RETURN_FALSE;
12771 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012772 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012773 if (substring == NULL) {
12774 if (PyErr_ExceptionMatches(PyExc_TypeError))
12775 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12776 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012777 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012778 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012779 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012781 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782}
12783
Victor Stinner202fdca2012-05-07 12:47:02 +020012784Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012785_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012786{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012787 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012788 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12789 writer->data = PyUnicode_DATA(writer->buffer);
12790 writer->kind = PyUnicode_KIND(writer->buffer);
12791}
12792
Victor Stinnerd3f08822012-05-29 12:57:52 +020012793void
12794_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012795{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012796 memset(writer, 0, sizeof(*writer));
12797#ifdef Py_DEBUG
12798 writer->kind = 5; /* invalid kind */
12799#endif
12800 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012801 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012802}
12803
Victor Stinnerd3f08822012-05-29 12:57:52 +020012804int
12805_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12806 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012807{
12808 Py_ssize_t newlen;
12809 PyObject *newbuffer;
12810
Victor Stinnerd3f08822012-05-29 12:57:52 +020012811 assert(length > 0);
12812
Victor Stinner202fdca2012-05-07 12:47:02 +020012813 if (length > PY_SSIZE_T_MAX - writer->pos) {
12814 PyErr_NoMemory();
12815 return -1;
12816 }
12817 newlen = writer->pos + length;
12818
Victor Stinnerd3f08822012-05-29 12:57:52 +020012819 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012820 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012821 /* overallocate 25% to limit the number of resize */
12822 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12823 newlen += newlen / 4;
12824 if (newlen < writer->min_length)
12825 newlen = writer->min_length;
12826 }
12827 writer->buffer = PyUnicode_New(newlen, maxchar);
12828 if (writer->buffer == NULL)
12829 return -1;
12830 _PyUnicodeWriter_Update(writer);
12831 return 0;
12832 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012833
Victor Stinnerd3f08822012-05-29 12:57:52 +020012834 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012835 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012836 /* overallocate 25% to limit the number of resize */
12837 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12838 newlen += newlen / 4;
12839 if (newlen < writer->min_length)
12840 newlen = writer->min_length;
12841 }
12842
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012843 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012844 /* resize + widen */
12845 newbuffer = PyUnicode_New(newlen, maxchar);
12846 if (newbuffer == NULL)
12847 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012848 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12849 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012850 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012851 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012852 }
12853 else {
12854 newbuffer = resize_compact(writer->buffer, newlen);
12855 if (newbuffer == NULL)
12856 return -1;
12857 }
12858 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012859 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012860 }
12861 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012862 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012863 newbuffer = PyUnicode_New(writer->size, maxchar);
12864 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012865 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012866 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12867 writer->buffer, 0, writer->pos);
12868 Py_DECREF(writer->buffer);
12869 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012870 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012871 }
12872 return 0;
12873}
12874
Victor Stinnerd3f08822012-05-29 12:57:52 +020012875int
12876_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12877{
12878 Py_UCS4 maxchar;
12879 Py_ssize_t len;
12880
12881 if (PyUnicode_READY(str) == -1)
12882 return -1;
12883 len = PyUnicode_GET_LENGTH(str);
12884 if (len == 0)
12885 return 0;
12886 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12887 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012888 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012889 Py_INCREF(str);
12890 writer->buffer = str;
12891 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012892 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012893 writer->size = 0;
12894 writer->pos += len;
12895 return 0;
12896 }
12897 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12898 return -1;
12899 }
12900 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12901 str, 0, len);
12902 writer->pos += len;
12903 return 0;
12904}
12905
12906PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012907_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012908{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012909 if (writer->pos == 0) {
12910 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020012911 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020012912 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012913 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012914 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12915 return writer->buffer;
12916 }
12917 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12918 PyObject *newbuffer;
12919 newbuffer = resize_compact(writer->buffer, writer->pos);
12920 if (newbuffer == NULL) {
12921 Py_DECREF(writer->buffer);
12922 return NULL;
12923 }
12924 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012925 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012926 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner2cb16aa2013-03-06 19:28:37 +010012927 return unicode_result_ready(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012928}
12929
Victor Stinnerd3f08822012-05-29 12:57:52 +020012930void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012931_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012932{
12933 Py_CLEAR(writer->buffer);
12934}
12935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012937
12938PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012939 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012940\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012941Return a formatted version of S, using substitutions from args and kwargs.\n\
12942The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012943
Eric Smith27bbca62010-11-04 17:06:58 +000012944PyDoc_STRVAR(format_map__doc__,
12945 "S.format_map(mapping) -> str\n\
12946\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012947Return a formatted version of S, using substitutions from mapping.\n\
12948The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012949
Eric Smith4a7d76d2008-05-30 18:10:19 +000012950static PyObject *
12951unicode__format__(PyObject* self, PyObject* args)
12952{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012953 PyObject *format_spec;
12954 _PyUnicodeWriter writer;
12955 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012956
12957 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12958 return NULL;
12959
Victor Stinnerd3f08822012-05-29 12:57:52 +020012960 if (PyUnicode_READY(self) == -1)
12961 return NULL;
12962 _PyUnicodeWriter_Init(&writer, 0);
12963 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12964 self, format_spec, 0,
12965 PyUnicode_GET_LENGTH(format_spec));
12966 if (ret == -1) {
12967 _PyUnicodeWriter_Dealloc(&writer);
12968 return NULL;
12969 }
12970 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012971}
12972
Eric Smith8c663262007-08-25 02:26:07 +000012973PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012974 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012975\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012976Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012977
12978static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012979unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 Py_ssize_t size;
12982
12983 /* If it's a compact object, account for base structure +
12984 character data. */
12985 if (PyUnicode_IS_COMPACT_ASCII(v))
12986 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12987 else if (PyUnicode_IS_COMPACT(v))
12988 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012989 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 else {
12991 /* If it is a two-block object, account for base object, and
12992 for character block if present. */
12993 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012994 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012995 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012996 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 }
12998 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012999 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013000 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013002 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013003 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004
13005 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013006}
13007
13008PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013009 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013010
13011static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013012unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013013{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013014 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 if (!copy)
13016 return NULL;
13017 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013018}
13019
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013021 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013022 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013023 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13024 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013025 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13026 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013027 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013028 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13029 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13030 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13031 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13032 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013033 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013034 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13035 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13036 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013037 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013038 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13039 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13040 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013041 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013042 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013043 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013044 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013045 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13046 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13047 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13048 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13049 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13050 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13051 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13052 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13053 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13054 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13055 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13056 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13057 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13058 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013059 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013060 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013061 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013062 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013063 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013064 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013065 {"maketrans", (PyCFunction) unicode_maketrans,
13066 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013067 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013068#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013069 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013070 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071#endif
13072
Benjamin Peterson14339b62009-01-31 16:36:08 +000013073 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074 {NULL, NULL}
13075};
13076
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013077static PyObject *
13078unicode_mod(PyObject *v, PyObject *w)
13079{
Brian Curtindfc80e32011-08-10 20:28:54 -050013080 if (!PyUnicode_Check(v))
13081 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013082 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013083}
13084
13085static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013086 0, /*nb_add*/
13087 0, /*nb_subtract*/
13088 0, /*nb_multiply*/
13089 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013090};
13091
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013093 (lenfunc) unicode_length, /* sq_length */
13094 PyUnicode_Concat, /* sq_concat */
13095 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13096 (ssizeargfunc) unicode_getitem, /* sq_item */
13097 0, /* sq_slice */
13098 0, /* sq_ass_item */
13099 0, /* sq_ass_slice */
13100 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101};
13102
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013103static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013104unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013105{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013106 if (PyUnicode_READY(self) == -1)
13107 return NULL;
13108
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013109 if (PyIndex_Check(item)) {
13110 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013111 if (i == -1 && PyErr_Occurred())
13112 return NULL;
13113 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013114 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013115 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013116 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013117 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013118 PyObject *result;
13119 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013120 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013121 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013124 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013125 return NULL;
13126 }
13127
13128 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013129 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013131 slicelength == PyUnicode_GET_LENGTH(self)) {
13132 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013133 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013134 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013135 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013136 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013137 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013138 src_kind = PyUnicode_KIND(self);
13139 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013140 if (!PyUnicode_IS_ASCII(self)) {
13141 kind_limit = kind_maxchar_limit(src_kind);
13142 max_char = 0;
13143 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13144 ch = PyUnicode_READ(src_kind, src_data, cur);
13145 if (ch > max_char) {
13146 max_char = ch;
13147 if (max_char >= kind_limit)
13148 break;
13149 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013150 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013151 }
Victor Stinner55c99112011-10-13 01:17:06 +020013152 else
13153 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013154 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013155 if (result == NULL)
13156 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013157 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013158 dest_data = PyUnicode_DATA(result);
13159
13160 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013161 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13162 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013163 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013164 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013165 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013166 } else {
13167 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13168 return NULL;
13169 }
13170}
13171
13172static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013173 (lenfunc)unicode_length, /* mp_length */
13174 (binaryfunc)unicode_subscript, /* mp_subscript */
13175 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013176};
13177
Guido van Rossumd57fd912000-03-10 22:53:23 +000013178
Guido van Rossumd57fd912000-03-10 22:53:23 +000013179/* Helpers for PyUnicode_Format() */
13180
13181static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013182getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013184 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013186 (*p_argidx)++;
13187 if (arglen < 0)
13188 return args;
13189 else
13190 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191 }
13192 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013193 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194 return NULL;
13195}
13196
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013197/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198
Victor Stinnerd3f08822012-05-29 12:57:52 +020013199static int
13200formatfloat(PyObject *v, int flags, int prec, int type,
13201 PyObject **p_output, _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013203 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013205 Py_ssize_t len;
Tim Petersced69f82003-09-16 20:30:58 +000013206
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207 x = PyFloat_AsDouble(v);
13208 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013209 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013210
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013213
Eric Smith0923d1d2009-04-16 20:16:10 +000013214 p = PyOS_double_to_string(x, type, prec,
13215 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013216 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013217 return -1;
13218 len = strlen(p);
13219 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013220 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13221 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013222 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013223 }
Victor Stinner184252a2012-06-16 02:57:41 +020013224 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013225 writer->pos += len;
13226 }
13227 else
13228 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013229 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013230 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231}
13232
Victor Stinnerd0880d52012-04-27 23:40:13 +020013233/* formatlong() emulates the format codes d, u, o, x and X, and
13234 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13235 * Python's regular ints.
13236 * Return value: a new PyUnicodeObject*, or NULL if error.
13237 * The output string is of the form
13238 * "-"? ("0x" | "0X")? digit+
13239 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13240 * set in flags. The case of hex digits will be correct,
13241 * There will be at least prec digits, zero-filled on the left if
13242 * necessary to get that many.
13243 * val object to be converted
13244 * flags bitmask of format flags; only F_ALT is looked at
13245 * prec minimum number of digits; 0-fill on left if needed
13246 * type a character in [duoxX]; u acts the same as d
13247 *
13248 * CAUTION: o, x and X conversions on regular ints can never
13249 * produce a '-' sign, but can for Python's unbounded ints.
13250 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013251static PyObject*
13252formatlong(PyObject *val, int flags, int prec, int type)
13253{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013254 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013255 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013256 Py_ssize_t i;
13257 int sign; /* 1 if '-', else 0 */
13258 int len; /* number of characters */
13259 Py_ssize_t llen;
13260 int numdigits; /* len == numnondigits + numdigits */
13261 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013262
Victor Stinnerd0880d52012-04-27 23:40:13 +020013263 /* Avoid exceeding SSIZE_T_MAX */
13264 if (prec > INT_MAX-3) {
13265 PyErr_SetString(PyExc_OverflowError,
13266 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013267 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013268 }
13269
13270 assert(PyLong_Check(val));
13271
13272 switch (type) {
13273 case 'd':
13274 case 'u':
13275 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013276 if (PyBool_Check(val))
13277 result = PyNumber_ToBase(val, 10);
13278 else
13279 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013280 break;
13281 case 'o':
13282 numnondigits = 2;
13283 result = PyNumber_ToBase(val, 8);
13284 break;
13285 case 'x':
13286 case 'X':
13287 numnondigits = 2;
13288 result = PyNumber_ToBase(val, 16);
13289 break;
13290 default:
13291 assert(!"'type' not in [duoxX]");
13292 }
13293 if (!result)
13294 return NULL;
13295
13296 assert(unicode_modifiable(result));
13297 assert(PyUnicode_IS_READY(result));
13298 assert(PyUnicode_IS_ASCII(result));
13299
13300 /* To modify the string in-place, there can only be one reference. */
13301 if (Py_REFCNT(result) != 1) {
13302 PyErr_BadInternalCall();
13303 return NULL;
13304 }
13305 buf = PyUnicode_DATA(result);
13306 llen = PyUnicode_GET_LENGTH(result);
13307 if (llen > INT_MAX) {
13308 PyErr_SetString(PyExc_ValueError,
13309 "string too large in _PyBytes_FormatLong");
13310 return NULL;
13311 }
13312 len = (int)llen;
13313 sign = buf[0] == '-';
13314 numnondigits += sign;
13315 numdigits = len - numnondigits;
13316 assert(numdigits > 0);
13317
13318 /* Get rid of base marker unless F_ALT */
13319 if (((flags & F_ALT) == 0 &&
13320 (type == 'o' || type == 'x' || type == 'X'))) {
13321 assert(buf[sign] == '0');
13322 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13323 buf[sign+1] == 'o');
13324 numnondigits -= 2;
13325 buf += 2;
13326 len -= 2;
13327 if (sign)
13328 buf[0] = '-';
13329 assert(len == numnondigits + numdigits);
13330 assert(numdigits > 0);
13331 }
13332
13333 /* Fill with leading zeroes to meet minimum width. */
13334 if (prec > numdigits) {
13335 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13336 numnondigits + prec);
13337 char *b1;
13338 if (!r1) {
13339 Py_DECREF(result);
13340 return NULL;
13341 }
13342 b1 = PyBytes_AS_STRING(r1);
13343 for (i = 0; i < numnondigits; ++i)
13344 *b1++ = *buf++;
13345 for (i = 0; i < prec - numdigits; i++)
13346 *b1++ = '0';
13347 for (i = 0; i < numdigits; i++)
13348 *b1++ = *buf++;
13349 *b1 = '\0';
13350 Py_DECREF(result);
13351 result = r1;
13352 buf = PyBytes_AS_STRING(result);
13353 len = numnondigits + prec;
13354 }
13355
13356 /* Fix up case for hex conversions. */
13357 if (type == 'X') {
13358 /* Need to convert all lower case letters to upper case.
13359 and need to convert 0x to 0X (and -0x to -0X). */
13360 for (i = 0; i < len; i++)
13361 if (buf[i] >= 'a' && buf[i] <= 'x')
13362 buf[i] -= 'a'-'A';
13363 }
13364 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13365 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013366 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013367 Py_DECREF(result);
13368 result = unicode;
13369 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013370 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013371}
13372
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013373static Py_UCS4
13374formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013376 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013377 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013378 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013379 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013380 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013381 goto onError;
13382 }
13383 else {
13384 /* Integer input truncated to a character */
13385 long x;
13386 x = PyLong_AsLong(v);
13387 if (x == -1 && PyErr_Occurred())
13388 goto onError;
13389
Victor Stinner8faf8212011-12-08 22:14:11 +010013390 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 PyErr_SetString(PyExc_OverflowError,
13392 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013393 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013394 }
13395
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013396 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013397 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013398
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013400 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013401 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013402 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013403}
13404
Alexander Belopolsky40018472011-02-26 01:02:56 +000013405PyObject *
13406PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013407{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013408 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013409 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013410 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013411 PyObject *temp = NULL;
13412 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013413 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013414 void *fmt;
13415 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013416 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013417 Py_ssize_t sublen;
13418 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013419
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013421 PyErr_BadInternalCall();
13422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013424 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013425 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013426 return NULL;
Victor Stinner19294072012-10-05 00:09:33 +020013427 if (PyUnicode_READY(uformat) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013428 Py_DECREF(uformat);
Victor Stinner19294072012-10-05 00:09:33 +020013429 return NULL;
13430 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013432 fmt = PyUnicode_DATA(uformat);
13433 fmtkind = PyUnicode_KIND(uformat);
13434 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13435 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436
Victor Stinnerd3f08822012-05-29 12:57:52 +020013437 _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013438
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 arglen = PyTuple_Size(args);
13441 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013442 }
13443 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 arglen = -1;
13445 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013446 }
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013447 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449
13450 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013451 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013452 Py_ssize_t nonfmtpos;
13453 nonfmtpos = fmtpos++;
13454 while (fmtcnt >= 0 &&
13455 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13456 fmtpos++;
13457 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013458 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013459 if (fmtcnt < 0)
13460 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013461 sublen = fmtpos - nonfmtpos;
13462 maxchar = _PyUnicode_FindMaxChar(uformat,
13463 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013464 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013465 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013466
Victor Stinnerd3f08822012-05-29 12:57:52 +020013467 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13468 uformat, nonfmtpos, sublen);
Victor Stinneree4544c2012-05-09 22:24:08 +020013469 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013470 }
13471 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 /* Got a format specifier */
13473 int flags = 0;
13474 Py_ssize_t width = -1;
13475 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013476 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013477 Py_UCS4 fill;
13478 int sign;
13479 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013480 int isnumok;
13481 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013482 void *pbuf = NULL;
13483 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013484 Py_UCS4 bufmaxchar;
13485 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013487 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013488 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13489 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013490 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013491 Py_ssize_t keylen;
13492 PyObject *key;
13493 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013494
Benjamin Peterson29060642009-01-31 22:14:21 +000013495 if (dict == NULL) {
13496 PyErr_SetString(PyExc_TypeError,
13497 "format requires a mapping");
13498 goto onError;
13499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013501 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013502 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013503 /* Skip over balanced parentheses */
13504 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013505 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13506 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013507 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013508 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013509 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013510 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013512 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013513 if (fmtcnt < 0 || pcount > 0) {
13514 PyErr_SetString(PyExc_ValueError,
13515 "incomplete format key");
13516 goto onError;
13517 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013518 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013519 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013520 if (key == NULL)
13521 goto onError;
13522 if (args_owned) {
13523 Py_DECREF(args);
13524 args_owned = 0;
13525 }
13526 args = PyObject_GetItem(dict, key);
13527 Py_DECREF(key);
13528 if (args == NULL) {
13529 goto onError;
13530 }
13531 args_owned = 1;
13532 arglen = -1;
13533 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013534 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013535 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013536 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13537 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 case '-': flags |= F_LJUST; continue;
13539 case '+': flags |= F_SIGN; continue;
13540 case ' ': flags |= F_BLANK; continue;
13541 case '#': flags |= F_ALT; continue;
13542 case '0': flags |= F_ZERO; continue;
13543 }
13544 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013545 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013546 if (c == '*') {
13547 v = getnextarg(args, arglen, &argidx);
13548 if (v == NULL)
13549 goto onError;
13550 if (!PyLong_Check(v)) {
13551 PyErr_SetString(PyExc_TypeError,
13552 "* wants int");
13553 goto onError;
13554 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013555 width = PyLong_AsSsize_t(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 if (width == -1 && PyErr_Occurred())
13557 goto onError;
13558 if (width < 0) {
13559 flags |= F_LJUST;
13560 width = -width;
13561 }
13562 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013563 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013564 }
13565 else if (c >= '0' && c <= '9') {
13566 width = c - '0';
13567 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013568 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 if (c < '0' || c > '9')
13570 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013571 /* Since c is unsigned, the RHS would end up as unsigned,
13572 mixing signed and unsigned comparison. Since c is between
13573 '0' and '9', casting to int is safe. */
13574 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013575 PyErr_SetString(PyExc_ValueError,
13576 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013577 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013578 }
13579 width = width*10 + (c - '0');
13580 }
13581 }
13582 if (c == '.') {
13583 prec = 0;
13584 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013585 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013586 if (c == '*') {
13587 v = getnextarg(args, arglen, &argidx);
13588 if (v == NULL)
13589 goto onError;
13590 if (!PyLong_Check(v)) {
13591 PyErr_SetString(PyExc_TypeError,
13592 "* wants int");
13593 goto onError;
13594 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013595 prec = _PyLong_AsInt(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013596 if (prec == -1 && PyErr_Occurred())
13597 goto onError;
13598 if (prec < 0)
13599 prec = 0;
13600 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013601 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013602 }
13603 else if (c >= '0' && c <= '9') {
13604 prec = c - '0';
13605 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013606 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 if (c < '0' || c > '9')
13608 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013609 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013610 PyErr_SetString(PyExc_ValueError,
13611 "prec too big");
13612 goto onError;
13613 }
13614 prec = prec*10 + (c - '0');
13615 }
13616 }
13617 } /* prec */
13618 if (fmtcnt >= 0) {
13619 if (c == 'h' || c == 'l' || c == 'L') {
13620 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013621 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013622 }
13623 }
13624 if (fmtcnt < 0) {
13625 PyErr_SetString(PyExc_ValueError,
13626 "incomplete format");
13627 goto onError;
13628 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013629 if (fmtcnt == 0)
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013630 writer.overallocate = 0;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013631
13632 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013633 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013634 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013635 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13636 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013637 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013638 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013639
Victor Stinneraff3cc62012-04-30 05:19:21 +020013640 v = getnextarg(args, arglen, &argidx);
13641 if (v == NULL)
13642 goto onError;
13643
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013645 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013646 fill = ' ';
13647 switch (c) {
13648
Benjamin Peterson29060642009-01-31 22:14:21 +000013649 case 's':
13650 case 'r':
13651 case 'a':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013652 if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13653 /* Fast path */
13654 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13655 goto onError;
13656 goto nextarg;
13657 }
13658
Victor Stinner808fc0a2010-03-22 12:50:40 +000013659 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013660 temp = v;
13661 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013662 }
13663 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013664 if (c == 's')
13665 temp = PyObject_Str(v);
13666 else if (c == 'r')
13667 temp = PyObject_Repr(v);
13668 else
13669 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013670 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013671 break;
13672
13673 case 'i':
13674 case 'd':
13675 case 'u':
13676 case 'o':
13677 case 'x':
13678 case 'X':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013679 if (PyLong_CheckExact(v)
13680 && width == -1 && prec == -1
13681 && !(flags & (F_SIGN | F_BLANK)))
13682 {
13683 /* Fast path */
13684 switch(c)
13685 {
13686 case 'd':
13687 case 'i':
13688 case 'u':
13689 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13690 goto onError;
13691 goto nextarg;
13692 case 'x':
13693 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13694 goto onError;
13695 goto nextarg;
13696 case 'o':
13697 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13698 goto onError;
13699 goto nextarg;
13700 default:
13701 break;
13702 }
13703 }
13704
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 isnumok = 0;
13706 if (PyNumber_Check(v)) {
13707 PyObject *iobj=NULL;
13708
13709 if (PyLong_Check(v)) {
13710 iobj = v;
13711 Py_INCREF(iobj);
13712 }
13713 else {
13714 iobj = PyNumber_Long(v);
13715 }
13716 if (iobj!=NULL) {
13717 if (PyLong_Check(iobj)) {
13718 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013719 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013720 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013721 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013722 }
13723 else {
13724 Py_DECREF(iobj);
13725 }
13726 }
13727 }
13728 if (!isnumok) {
13729 PyErr_Format(PyExc_TypeError,
13730 "%%%c format: a number is required, "
13731 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13732 goto onError;
13733 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013734 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013735 fill = '0';
13736 break;
13737
13738 case 'e':
13739 case 'E':
13740 case 'f':
13741 case 'F':
13742 case 'g':
13743 case 'G':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013744 if (width == -1 && prec == -1
13745 && !(flags & (F_SIGN | F_BLANK)))
13746 {
13747 /* Fast path */
13748 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13749 goto onError;
13750 goto nextarg;
13751 }
13752
Benjamin Peterson29060642009-01-31 22:14:21 +000013753 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013754 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013755 fill = '0';
Victor Stinnerd3f08822012-05-29 12:57:52 +020013756 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13757 temp = NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013758 break;
13759
13760 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013761 {
13762 Py_UCS4 ch = formatchar(v);
13763 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013764 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013765 if (width == -1 && prec == -1) {
13766 /* Fast path */
13767 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13768 goto onError;
13769 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13770 writer.pos += 1;
13771 goto nextarg;
13772 }
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013773 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013774 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013775 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013776
13777 default:
13778 PyErr_Format(PyExc_ValueError,
13779 "unsupported format character '%c' (0x%x) "
13780 "at index %zd",
13781 (31<=c && c<=126) ? (char)c : '?',
13782 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013783 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013784 goto onError;
13785 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013786 if (temp == NULL)
13787 goto onError;
13788 assert (PyUnicode_Check(temp));
Victor Stinnerd3f08822012-05-29 12:57:52 +020013789
13790 if (width == -1 && prec == -1
13791 && !(flags & (F_SIGN | F_BLANK)))
13792 {
13793 /* Fast path */
13794 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13795 goto onError;
13796 goto nextarg;
13797 }
13798
Victor Stinneraff3cc62012-04-30 05:19:21 +020013799 if (PyUnicode_READY(temp) == -1) {
13800 Py_CLEAR(temp);
13801 goto onError;
13802 }
13803 kind = PyUnicode_KIND(temp);
13804 pbuf = PyUnicode_DATA(temp);
13805 len = PyUnicode_GET_LENGTH(temp);
13806
13807 if (c == 's' || c == 'r' || c == 'a') {
13808 if (prec >= 0 && len > prec)
13809 len = prec;
13810 }
13811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013812 /* pbuf is initialized here. */
13813 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013814 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013815 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13816 if (ch == '-' || ch == '+') {
13817 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013818 len--;
13819 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013820 }
13821 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013822 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000013823 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013824 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000013825 else
13826 sign = 0;
13827 }
13828 if (width < len)
13829 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013830
13831 /* Compute the length and maximum character of the
13832 written characters */
13833 bufmaxchar = 127;
13834 if (!(flags & F_LJUST)) {
13835 if (sign) {
13836 if ((width-1) > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013837 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013838 }
13839 else {
13840 if (width > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013841 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013842 }
13843 }
13844 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013845 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
Victor Stinneree4544c2012-05-09 22:24:08 +020013846
13847 buflen = width;
13848 if (sign && len == width)
13849 buflen++;
13850
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013851 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020013852 goto onError;
13853
13854 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000013855 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013856 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020013857 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13858 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013859 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013860 if (width > len)
13861 width--;
13862 }
13863 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013864 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013865 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013866 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013867 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13868 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13869 writer.pos += 2;
13870 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000013871 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013872 width -= 2;
13873 if (width < 0)
13874 width = 0;
13875 len -= 2;
13876 }
13877 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013878 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013879 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13880 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013881 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013882 }
13883 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013884 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013885 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13886 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013887 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013888 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013889 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13890 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013891 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13892 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13893 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013894 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013895 }
13896 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013897
Victor Stinnerc9d369f2012-06-16 02:22:37 +020013898 if (len) {
13899 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13900 temp, pindex, len);
13901 writer.pos += len;
13902 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013903 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013904 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013905 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13906 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013907 }
Victor Stinneree4544c2012-05-09 22:24:08 +020013908
Victor Stinnerd3f08822012-05-29 12:57:52 +020013909nextarg:
Benjamin Peterson29060642009-01-31 22:14:21 +000013910 if (dict && (argidx < arglen) && c != '%') {
13911 PyErr_SetString(PyExc_TypeError,
13912 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013913 goto onError;
13914 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013915 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013916 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013917 } /* until end */
13918 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013919 PyErr_SetString(PyExc_TypeError,
13920 "not all arguments converted during string formatting");
13921 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013922 }
13923
13924 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013925 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013926 }
13927 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013928 Py_XDECREF(temp);
13929 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013930 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013931
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013933 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013934 Py_XDECREF(temp);
13935 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013936 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013937 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013938 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013939 }
13940 return NULL;
13941}
13942
Jeremy Hylton938ace62002-07-17 16:30:39 +000013943static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013944unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13945
Tim Peters6d6c1a32001-08-02 04:15:00 +000013946static PyObject *
13947unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13948{
Benjamin Peterson29060642009-01-31 22:14:21 +000013949 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013950 static char *kwlist[] = {"object", "encoding", "errors", 0};
13951 char *encoding = NULL;
13952 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013953
Benjamin Peterson14339b62009-01-31 16:36:08 +000013954 if (type != &PyUnicode_Type)
13955 return unicode_subtype_new(type, args, kwds);
13956 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013957 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013958 return NULL;
13959 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020013960 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000013961 if (encoding == NULL && errors == NULL)
13962 return PyObject_Str(x);
13963 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013964 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013965}
13966
Guido van Rossume023fe02001-08-30 03:12:59 +000013967static PyObject *
13968unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13969{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013970 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013971 Py_ssize_t length, char_size;
13972 int share_wstr, share_utf8;
13973 unsigned int kind;
13974 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013975
Benjamin Peterson14339b62009-01-31 16:36:08 +000013976 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013977
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013978 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013979 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013980 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013981 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013982 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013983 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013984 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013985 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013986
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013987 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013988 if (self == NULL) {
13989 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013990 return NULL;
13991 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013992 kind = PyUnicode_KIND(unicode);
13993 length = PyUnicode_GET_LENGTH(unicode);
13994
13995 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013996#ifdef Py_DEBUG
13997 _PyUnicode_HASH(self) = -1;
13998#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013999 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014000#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014001 _PyUnicode_STATE(self).interned = 0;
14002 _PyUnicode_STATE(self).kind = kind;
14003 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014004 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014005 _PyUnicode_STATE(self).ready = 1;
14006 _PyUnicode_WSTR(self) = NULL;
14007 _PyUnicode_UTF8_LENGTH(self) = 0;
14008 _PyUnicode_UTF8(self) = NULL;
14009 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014010 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014011
14012 share_utf8 = 0;
14013 share_wstr = 0;
14014 if (kind == PyUnicode_1BYTE_KIND) {
14015 char_size = 1;
14016 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14017 share_utf8 = 1;
14018 }
14019 else if (kind == PyUnicode_2BYTE_KIND) {
14020 char_size = 2;
14021 if (sizeof(wchar_t) == 2)
14022 share_wstr = 1;
14023 }
14024 else {
14025 assert(kind == PyUnicode_4BYTE_KIND);
14026 char_size = 4;
14027 if (sizeof(wchar_t) == 4)
14028 share_wstr = 1;
14029 }
14030
14031 /* Ensure we won't overflow the length. */
14032 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14033 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014034 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014035 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014036 data = PyObject_MALLOC((length + 1) * char_size);
14037 if (data == NULL) {
14038 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014039 goto onError;
14040 }
14041
Victor Stinnerc3c74152011-10-02 20:39:55 +020014042 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014043 if (share_utf8) {
14044 _PyUnicode_UTF8_LENGTH(self) = length;
14045 _PyUnicode_UTF8(self) = data;
14046 }
14047 if (share_wstr) {
14048 _PyUnicode_WSTR_LENGTH(self) = length;
14049 _PyUnicode_WSTR(self) = (wchar_t *)data;
14050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014051
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014052 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014053 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014054 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014055#ifdef Py_DEBUG
14056 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14057#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014058 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014059 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014060
14061onError:
14062 Py_DECREF(unicode);
14063 Py_DECREF(self);
14064 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014065}
14066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014067PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014068"str(object='') -> str\n\
14069str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014070\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014071Create a new string object from the given object. If encoding or\n\
14072errors is specified, then the object must expose a data buffer\n\
14073that will be decoded using the given encoding and error handler.\n\
14074Otherwise, returns the result of object.__str__() (if defined)\n\
14075or repr(object).\n\
14076encoding defaults to sys.getdefaultencoding().\n\
14077errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014078
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014079static PyObject *unicode_iter(PyObject *seq);
14080
Guido van Rossumd57fd912000-03-10 22:53:23 +000014081PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014082 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014083 "str", /* tp_name */
14084 sizeof(PyUnicodeObject), /* tp_size */
14085 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014086 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014087 (destructor)unicode_dealloc, /* tp_dealloc */
14088 0, /* tp_print */
14089 0, /* tp_getattr */
14090 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014091 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014092 unicode_repr, /* tp_repr */
14093 &unicode_as_number, /* tp_as_number */
14094 &unicode_as_sequence, /* tp_as_sequence */
14095 &unicode_as_mapping, /* tp_as_mapping */
14096 (hashfunc) unicode_hash, /* tp_hash*/
14097 0, /* tp_call*/
14098 (reprfunc) unicode_str, /* tp_str */
14099 PyObject_GenericGetAttr, /* tp_getattro */
14100 0, /* tp_setattro */
14101 0, /* tp_as_buffer */
14102 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014103 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014104 unicode_doc, /* tp_doc */
14105 0, /* tp_traverse */
14106 0, /* tp_clear */
14107 PyUnicode_RichCompare, /* tp_richcompare */
14108 0, /* tp_weaklistoffset */
14109 unicode_iter, /* tp_iter */
14110 0, /* tp_iternext */
14111 unicode_methods, /* tp_methods */
14112 0, /* tp_members */
14113 0, /* tp_getset */
14114 &PyBaseObject_Type, /* tp_base */
14115 0, /* tp_dict */
14116 0, /* tp_descr_get */
14117 0, /* tp_descr_set */
14118 0, /* tp_dictoffset */
14119 0, /* tp_init */
14120 0, /* tp_alloc */
14121 unicode_new, /* tp_new */
14122 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014123};
14124
14125/* Initialize the Unicode implementation */
14126
Victor Stinner3a50e702011-10-18 21:21:00 +020014127int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014128{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014129 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014130 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014131 0x000A, /* LINE FEED */
14132 0x000D, /* CARRIAGE RETURN */
14133 0x001C, /* FILE SEPARATOR */
14134 0x001D, /* GROUP SEPARATOR */
14135 0x001E, /* RECORD SEPARATOR */
14136 0x0085, /* NEXT LINE */
14137 0x2028, /* LINE SEPARATOR */
14138 0x2029, /* PARAGRAPH SEPARATOR */
14139 };
14140
Fred Drakee4315f52000-05-09 19:53:39 +000014141 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014142 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014143 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014144 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014145 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014146
Guido van Rossumcacfc072002-05-24 19:01:59 +000014147 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014148 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014149
14150 /* initialize the linebreak bloom filter */
14151 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014152 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014153 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014154
14155 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014156
Benjamin Petersonc4311282012-10-30 23:21:10 -040014157 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14158 Py_FatalError("Can't initialize field name iterator type");
14159
14160 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14161 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014162
Victor Stinner3a50e702011-10-18 21:21:00 +020014163#ifdef HAVE_MBCS
14164 winver.dwOSVersionInfoSize = sizeof(winver);
14165 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14166 PyErr_SetFromWindowsErr(0);
14167 return -1;
14168 }
14169#endif
14170 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014171}
14172
14173/* Finalize the Unicode implementation */
14174
/* Does no work in this implementation; the result is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14180
Guido van Rossumd57fd912000-03-10 22:53:23 +000014181void
Thomas Wouters78890102000-07-22 19:25:51 +000014182_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014183{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014184 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014185
Serhiy Storchaka05997252013-01-26 12:14:02 +020014186 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014187
Serhiy Storchaka05997252013-01-26 12:14:02 +020014188 for (i = 0; i < 256; i++)
14189 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014190 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014191 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014192}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014193
Walter Dörwald16807132007-05-25 13:52:07 +000014194void
14195PyUnicode_InternInPlace(PyObject **p)
14196{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014197 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014198 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014199#ifdef Py_DEBUG
14200 assert(s != NULL);
14201 assert(_PyUnicode_CHECK(s));
14202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014203 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014204 return;
14205#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014206 /* If it's a subclass, we don't really know what putting
14207 it in the interned dict might do. */
14208 if (!PyUnicode_CheckExact(s))
14209 return;
14210 if (PyUnicode_CHECK_INTERNED(s))
14211 return;
14212 if (interned == NULL) {
14213 interned = PyDict_New();
14214 if (interned == NULL) {
14215 PyErr_Clear(); /* Don't leave an exception */
14216 return;
14217 }
14218 }
14219 /* It might be that the GetItem call fails even
14220 though the key is present in the dictionary,
14221 namely when this happens during a stack overflow. */
14222 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014223 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014224 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014225
Benjamin Peterson29060642009-01-31 22:14:21 +000014226 if (t) {
14227 Py_INCREF(t);
14228 Py_DECREF(*p);
14229 *p = t;
14230 return;
14231 }
Walter Dörwald16807132007-05-25 13:52:07 +000014232
Benjamin Peterson14339b62009-01-31 16:36:08 +000014233 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014234 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014235 PyErr_Clear();
14236 PyThreadState_GET()->recursion_critical = 0;
14237 return;
14238 }
14239 PyThreadState_GET()->recursion_critical = 0;
14240 /* The two references in interned are not counted by refcnt.
14241 The deallocator will take care of this */
14242 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014243 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014244}
14245
14246void
14247PyUnicode_InternImmortal(PyObject **p)
14248{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014249 PyUnicode_InternInPlace(p);
14250 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014251 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014252 Py_INCREF(*p);
14253 }
Walter Dörwald16807132007-05-25 13:52:07 +000014254}
14255
14256PyObject *
14257PyUnicode_InternFromString(const char *cp)
14258{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014259 PyObject *s = PyUnicode_FromString(cp);
14260 if (s == NULL)
14261 return NULL;
14262 PyUnicode_InternInPlace(&s);
14263 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014264}
14265
Alexander Belopolsky40018472011-02-26 01:02:56 +000014266void
14267_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014268{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014269 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014270 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014271 Py_ssize_t i, n;
14272 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014273
Benjamin Peterson14339b62009-01-31 16:36:08 +000014274 if (interned == NULL || !PyDict_Check(interned))
14275 return;
14276 keys = PyDict_Keys(interned);
14277 if (keys == NULL || !PyList_Check(keys)) {
14278 PyErr_Clear();
14279 return;
14280 }
Walter Dörwald16807132007-05-25 13:52:07 +000014281
Benjamin Peterson14339b62009-01-31 16:36:08 +000014282 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14283 detector, interned unicode strings are not forcibly deallocated;
14284 rather, we give them their stolen references back, and then clear
14285 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014286
Benjamin Peterson14339b62009-01-31 16:36:08 +000014287 n = PyList_GET_SIZE(keys);
14288 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014289 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014290 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014291 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014292 if (PyUnicode_READY(s) == -1) {
14293 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014294 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014296 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014297 case SSTATE_NOT_INTERNED:
14298 /* XXX Shouldn't happen */
14299 break;
14300 case SSTATE_INTERNED_IMMORTAL:
14301 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014302 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014303 break;
14304 case SSTATE_INTERNED_MORTAL:
14305 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014306 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014307 break;
14308 default:
14309 Py_FatalError("Inconsistent interned string state.");
14310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014311 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014312 }
14313 fprintf(stderr, "total size of all interned strings: "
14314 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14315 "mortal/immortal\n", mortal_size, immortal_size);
14316 Py_DECREF(keys);
14317 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014318 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014319}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014320
14321
14322/********************* Unicode Iterator **************************/
14323
/* State of a str iterator (instances of PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* String being iterated (the iterator holds a
                            reference).  Set to NULL when iterator is
                            exhausted */
} unicodeiterobject;
14329
14330static void
14331unicodeiter_dealloc(unicodeiterobject *it)
14332{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014333 _PyObject_GC_UNTRACK(it);
14334 Py_XDECREF(it->it_seq);
14335 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014336}
14337
14338static int
14339unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14340{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014341 Py_VISIT(it->it_seq);
14342 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014343}
14344
14345static PyObject *
14346unicodeiter_next(unicodeiterobject *it)
14347{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014348 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014349
Benjamin Peterson14339b62009-01-31 16:36:08 +000014350 assert(it != NULL);
14351 seq = it->it_seq;
14352 if (seq == NULL)
14353 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014354 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014356 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14357 int kind = PyUnicode_KIND(seq);
14358 void *data = PyUnicode_DATA(seq);
14359 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14360 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014361 if (item != NULL)
14362 ++it->it_index;
14363 return item;
14364 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014365
Benjamin Peterson14339b62009-01-31 16:36:08 +000014366 Py_DECREF(seq);
14367 it->it_seq = NULL;
14368 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014369}
14370
14371static PyObject *
14372unicodeiter_len(unicodeiterobject *it)
14373{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014374 Py_ssize_t len = 0;
14375 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014376 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014377 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014378}
14379
14380PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14381
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014382static PyObject *
14383unicodeiter_reduce(unicodeiterobject *it)
14384{
14385 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014386 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014387 it->it_seq, it->it_index);
14388 } else {
14389 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14390 if (u == NULL)
14391 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014392 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014393 }
14394}
14395
14396PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14397
14398static PyObject *
14399unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14400{
14401 Py_ssize_t index = PyLong_AsSsize_t(state);
14402 if (index == -1 && PyErr_Occurred())
14403 return NULL;
14404 if (index < 0)
14405 index = 0;
14406 it->it_index = index;
14407 Py_RETURN_NONE;
14408}
14409
14410PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14411
/* Methods exposed on str iterator objects: the length hint plus the
   __reduce__/__setstate__ pair used for pickling. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
14421
/* Type object for str iterators ("str_iterator").  Instances are
   GC-tracked unicodeiterobject structs; iteration is implemented by
   unicodeiter_next and the iterator is its own iterator
   (PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* presumably tp_members (the slot after
                           tp_methods) -- confirm against PyTypeObject */
};
14454
14455static PyObject *
14456unicode_iter(PyObject *seq)
14457{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014458 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014459
Benjamin Peterson14339b62009-01-31 16:36:08 +000014460 if (!PyUnicode_Check(seq)) {
14461 PyErr_BadInternalCall();
14462 return NULL;
14463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014464 if (PyUnicode_READY(seq) == -1)
14465 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014466 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14467 if (it == NULL)
14468 return NULL;
14469 it->it_index = 0;
14470 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014471 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014472 _PyObject_GC_TRACK(it);
14473 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014474}
14475
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014476
14477size_t
14478Py_UNICODE_strlen(const Py_UNICODE *u)
14479{
14480 int res = 0;
14481 while(*u++)
14482 res++;
14483 return res;
14484}
14485
14486Py_UNICODE*
14487Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14488{
14489 Py_UNICODE *u = s1;
14490 while ((*u++ = *s2++));
14491 return s1;
14492}
14493
14494Py_UNICODE*
14495Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14496{
14497 Py_UNICODE *u = s1;
14498 while ((*u++ = *s2++))
14499 if (n-- == 0)
14500 break;
14501 return s1;
14502}
14503
14504Py_UNICODE*
14505Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14506{
14507 Py_UNICODE *u1 = s1;
14508 u1 += Py_UNICODE_strlen(u1);
14509 Py_UNICODE_strcpy(u1, s2);
14510 return s1;
14511}
14512
14513int
14514Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14515{
14516 while (*s1 && *s2 && *s1 == *s2)
14517 s1++, s2++;
14518 if (*s1 && *s2)
14519 return (*s1 < *s2) ? -1 : +1;
14520 if (*s1)
14521 return 1;
14522 if (*s2)
14523 return -1;
14524 return 0;
14525}
14526
14527int
14528Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14529{
14530 register Py_UNICODE u1, u2;
14531 for (; n != 0; n--) {
14532 u1 = *s1;
14533 u2 = *s2;
14534 if (u1 != u2)
14535 return (u1 < u2) ? -1 : +1;
14536 if (u1 == '\0')
14537 return 0;
14538 s1++;
14539 s2++;
14540 }
14541 return 0;
14542}
14543
14544Py_UNICODE*
14545Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14546{
14547 const Py_UNICODE *p;
14548 for (p = s; *p; p++)
14549 if (*p == c)
14550 return (Py_UNICODE*)p;
14551 return NULL;
14552}
14553
14554Py_UNICODE*
14555Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14556{
14557 const Py_UNICODE *p;
14558 p = s + Py_UNICODE_strlen(s);
14559 while (p != s) {
14560 p--;
14561 if (*p == c)
14562 return (Py_UNICODE*)p;
14563 }
14564 return NULL;
14565}
Victor Stinner331ea922010-08-10 16:37:20 +000014566
Victor Stinner71133ff2010-09-01 23:43:53 +000014567Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014568PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014569{
Victor Stinner577db2c2011-10-11 22:12:48 +020014570 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014571 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014573 if (!PyUnicode_Check(unicode)) {
14574 PyErr_BadArgument();
14575 return NULL;
14576 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014577 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014578 if (u == NULL)
14579 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014580 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014581 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014582 PyErr_NoMemory();
14583 return NULL;
14584 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014585 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014586 size *= sizeof(Py_UNICODE);
14587 copy = PyMem_Malloc(size);
14588 if (copy == NULL) {
14589 PyErr_NoMemory();
14590 return NULL;
14591 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014592 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014593 return copy;
14594}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014595
Georg Brandl66c221e2010-10-14 07:04:07 +000014596/* A _string module, to export formatter_parser and formatter_field_name_split
14597 to the string.Formatter class implemented in Python. */
14598
/* Functions exported by the _string module.  Both take exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
14606
/* Module definition for _string; passed to PyModule_Create() by
   PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,                                  /* presumably m_size -- confirm
                                           against PyModuleDef */
    _string_methods,                    /* method table */
    NULL,                               /* remaining slots are unused */
    NULL,
    NULL,
    NULL
};
14618
14619PyMODINIT_FUNC
14620PyInit__string(void)
14621{
14622 return PyModule_Create(&_string_module);
14623}
14624
14625
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014626#ifdef __cplusplus
14627}
14628#endif