blob: 23b8cc764d44de4e7cf70c60606cf1f7caa8d248 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000056/* --- Globals ------------------------------------------------------------
57
Serhiy Storchaka05997252013-01-26 12:14:02 +020058NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest valid Unicode code point: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros in this file: a full structural
   consistency check (without content scan) in debug builds, and a plain
   type check otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* --- Raw and checked field accessors -----------------------------------

   The underscore-prefixed macros that contain no assert() are plain member
   accesses and can be used as lvalues.  The others first assert that `op`
   passes _PyUnicode_CHECK() before touching the object. */

/* Cached UTF-8 buffer pointer stored in the compact header. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII strings this is the
   memory located right after the PyASCIIObject header, otherwise the
   cached utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Length field that goes with the cached UTF-8 buffer. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: for compact ASCII strings this is the
   string length itself, otherwise the cached utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation pointer and its length field. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the length, state and hash members. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to the storage kind and to the length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer member of a PyUnicodeObject (data.any). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local override of PyUnicode_READY(): evaluates to 0 when `op` is
   already ready, otherwise to the result of _PyUnicode_Ready(op).  `op` is
   asserted to pass _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the cached UTF-8 pointer of `op` is the string's own data buffer
   (i.e. no separate UTF-8 block).  Must not be used on compact ASCII
   strings; this is asserted. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wchar_t pointer of `op` is the string's own data buffer
   (i.e. no separate wstr block).

   The expansion must refer to the macro parameter `op`: the previous
   version read `_PyUnicode_WSTR(unicode)`, which only compiled when the
   caller happened to have a variable named `unicode` in scope and then
   silently inspected that object instead of the argument. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not simply the string's data buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr block:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not simply the string's data buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Copy a run of characters while converting the element type.

   from_type, to_type: valid type names for the source and destination
                       elements.
   begin, end:         delimit the source range [begin, end); both are
                       treated as "from_type *".
   to:                 destination buffer, treated as "to_type *".

   Each element is converted with a plain cast to to_type.  The bulk of the
   range is handled four elements per iteration; the remaining 0-3 elements
   are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)      \
    do {                                                                   \
        to_type *_dst = (to_type *)(to);                                   \
        const from_type *_src = (from_type *)(begin);                      \
        const from_type *_stop = (from_type *)(end);                       \
        Py_ssize_t _count = (_stop) - (_src);                              \
        const from_type *_block_stop =                                     \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                         \
        for (; _src < (_block_stop); _src += 4, _dst += 4) {               \
            _dst[0] = (to_type) _src[0];                                   \
            _dst[1] = (to_type) _src[1];                                   \
            _dst[2] = (to_type) _src[2];                                   \
            _dst[3] = (to_type) _src[3];                                   \
        }                                                                  \
        for (; _src < (_stop); ++_src, ++_dst)                             \
            *_dst = (to_type) *_src;                                       \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Platform-tuned overallocation factor.  Per the original notes, 50% works
   best on Windows (factor 2) and 25% works best on Linux (factor 4);
   presumably the extra space requested is size / OVERALLOCATE_FACTOR --
   confirm at the use sites. */
#ifndef MS_WINDOWS
#  define OVERALLOCATE_FACTOR 4
#else
#  define OVERALLOCATE_FACTOR 2
#endif
173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
Serhiy Storchaka678db842013-01-26 12:16:36 +0200187#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 do { \
189 if (unicode_empty != NULL) \
190 Py_INCREF(unicode_empty); \
191 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200192 unicode_empty = PyUnicode_New(0, 0); \
193 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200194 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200195 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
196 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200#define _Py_RETURN_UNICODE_EMPTY() \
201 do { \
202 _Py_INCREF_UNICODE_EMPTY(); \
203 return unicode_empty; \
204 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
207Py_LOCAL_INLINE(int)
208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise.

   Entries set to 1:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Fast detection of line breaks in the ASCII range: a 128-entry table
   indexed by ASCII code, 1 for a line break character and 0 otherwise.

   Entries set to 1:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
*/
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers recognized by name.  _Py_ERROR_UNKNOWN (0) is never
   returned by get_error_handler(); _Py_ERROR_OTHER stands for any name that
   is not listed here. */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: handler name, or NULL (treated like "strict").

   Returns the matching enum value, or _Py_ERROR_OTHER when the name is not
   one of the handlers known here.  The comparison is exact and
   case-sensitive. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE}
    };
    size_t idx;

    if (errors == NULL)
        return _Py_ERROR_STRICT;

    for (idx = 0;
         idx < sizeof(known_handlers) / sizeof(known_handlers[0]);
         idx++)
    {
        if (strcmp(errors, known_handlers[idx].name) == 0)
            return known_handlers[idx].handler;
    }
    return _Py_ERROR_OTHER;
}
335
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300336/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
337 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000338Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000339PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000340{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000341#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000342 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000343#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000344 /* This is actually an illegal character, so it should
345 not be passed to unichr. */
346 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347#endif
348}
349
#ifdef Py_DEBUG
/* Debug-build validation of the internal invariants of a unicode object.

   op:            object to check; asserted to be a unicode object.
   check_content: if nonzero, additionally scan the characters to verify
                  that the narrowest suitable kind is used and that the
                  buffer is null-terminated (skipped for wchar-kind
                  strings).

   Every violated invariant trips an assert(); when all checks pass the
   function returns 1, so it can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* Compact ASCII: always 1 byte per character and ready. */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* Non-compact object: the data pointer is stored in the
               PyUnicodeObject itself. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr representation exists. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation. */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }
        else {
            /* Compact non-ASCII: characters directly follow the compact
               header. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr may alias the data only for the kind whose character
               width equals sizeof(wchar_t). */
#if SIZEOF_WCHAR_T == 2
            if (kind == PyUnicode_2BYTE_KIND)
#else
            if (kind == PyUnicode_4BYTE_KIND)
#endif
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* Missing caches must report a zero length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i = 0; i < ascii->length; i++) {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The buffer must end with a null character. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
465
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466static PyObject*
467unicode_result_wchar(PyObject *unicode)
468{
469#ifndef Py_DEBUG
470 Py_ssize_t len;
471
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100472 len = _PyUnicode_WSTR_LENGTH(unicode);
473 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100474 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200475 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 }
477
478 if (len == 1) {
479 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100480 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
482 Py_DECREF(unicode);
483 return latin1_char;
484 }
485 }
486
487 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200488 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489 return NULL;
490 }
491#else
Victor Stinneraa771272012-10-04 02:32:58 +0200492 assert(Py_REFCNT(unicode) == 1);
493
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100494 /* don't make the result ready in debug mode to ensure that the caller
495 makes the string ready before using it */
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497#endif
498 return unicode;
499}
500
501static PyObject*
502unicode_result_ready(PyObject *unicode)
503{
504 Py_ssize_t length;
505
506 length = PyUnicode_GET_LENGTH(unicode);
507 if (length == 0) {
508 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100509 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200510 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100511 }
512 return unicode_empty;
513 }
514
515 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200516 void *data = PyUnicode_DATA(unicode);
517 int kind = PyUnicode_KIND(unicode);
518 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100519 if (ch < 256) {
520 PyObject *latin1_char = unicode_latin1[ch];
521 if (latin1_char != NULL) {
522 if (unicode != latin1_char) {
523 Py_INCREF(latin1_char);
524 Py_DECREF(unicode);
525 }
526 return latin1_char;
527 }
528 else {
529 assert(_PyUnicode_CheckConsistency(unicode, 1));
530 Py_INCREF(unicode);
531 unicode_latin1[ch] = unicode;
532 return unicode;
533 }
534 }
535 }
536
537 assert(_PyUnicode_CheckConsistency(unicode, 1));
538 return unicode;
539}
540
541static PyObject*
542unicode_result(PyObject *unicode)
543{
544 assert(_PyUnicode_CHECK(unicode));
545 if (PyUnicode_IS_READY(unicode))
546 return unicode_result_ready(unicode);
547 else
548 return unicode_result_wchar(unicode);
549}
550
Victor Stinnerc4b49542011-12-11 22:44:26 +0100551static PyObject*
552unicode_result_unchanged(PyObject *unicode)
553{
554 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500555 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100556 return NULL;
557 Py_INCREF(unicode);
558 return unicode;
559 }
560 else
561 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100562 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563}
564
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200565/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
566 ASCII, Latin1, UTF-8, etc. */
567static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200568backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200569 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
570{
Victor Stinnerad771582015-10-09 12:38:53 +0200571 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572 Py_UCS4 ch;
573 enum PyUnicode_Kind kind;
574 void *data;
575
576 assert(PyUnicode_IS_READY(unicode));
577 kind = PyUnicode_KIND(unicode);
578 data = PyUnicode_DATA(unicode);
579
580 size = 0;
581 /* determine replacement size */
582 for (i = collstart; i < collend; ++i) {
583 Py_ssize_t incr;
584
585 ch = PyUnicode_READ(kind, data, i);
586 if (ch < 0x100)
587 incr = 2+2;
588 else if (ch < 0x10000)
589 incr = 2+4;
590 else {
591 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200592 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200593 }
594 if (size > PY_SSIZE_T_MAX - incr) {
595 PyErr_SetString(PyExc_OverflowError,
596 "encoded result is too long for a Python string");
597 return NULL;
598 }
599 size += incr;
600 }
601
Victor Stinnerad771582015-10-09 12:38:53 +0200602 str = _PyBytesWriter_Prepare(writer, str, size);
603 if (str == NULL)
604 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200605
606 /* generate replacement */
607 for (i = collstart; i < collend; ++i) {
608 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200609 *str++ = '\\';
610 if (ch >= 0x00010000) {
611 *str++ = 'U';
612 *str++ = Py_hexdigits[(ch>>28)&0xf];
613 *str++ = Py_hexdigits[(ch>>24)&0xf];
614 *str++ = Py_hexdigits[(ch>>20)&0xf];
615 *str++ = Py_hexdigits[(ch>>16)&0xf];
616 *str++ = Py_hexdigits[(ch>>12)&0xf];
617 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200618 }
Victor Stinner797485e2015-10-09 03:17:30 +0200619 else if (ch >= 0x100) {
620 *str++ = 'u';
621 *str++ = Py_hexdigits[(ch>>12)&0xf];
622 *str++ = Py_hexdigits[(ch>>8)&0xf];
623 }
624 else
625 *str++ = 'x';
626 *str++ = Py_hexdigits[(ch>>4)&0xf];
627 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200628 }
629 return str;
630}
631
632/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
633 ASCII, Latin1, UTF-8, etc. */
634static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200635xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200636 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
637{
Victor Stinnerad771582015-10-09 12:38:53 +0200638 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200639 Py_UCS4 ch;
640 enum PyUnicode_Kind kind;
641 void *data;
642
643 assert(PyUnicode_IS_READY(unicode));
644 kind = PyUnicode_KIND(unicode);
645 data = PyUnicode_DATA(unicode);
646
647 size = 0;
648 /* determine replacement size */
649 for (i = collstart; i < collend; ++i) {
650 Py_ssize_t incr;
651
652 ch = PyUnicode_READ(kind, data, i);
653 if (ch < 10)
654 incr = 2+1+1;
655 else if (ch < 100)
656 incr = 2+2+1;
657 else if (ch < 1000)
658 incr = 2+3+1;
659 else if (ch < 10000)
660 incr = 2+4+1;
661 else if (ch < 100000)
662 incr = 2+5+1;
663 else if (ch < 1000000)
664 incr = 2+6+1;
665 else {
666 assert(ch <= MAX_UNICODE);
667 incr = 2+7+1;
668 }
669 if (size > PY_SSIZE_T_MAX - incr) {
670 PyErr_SetString(PyExc_OverflowError,
671 "encoded result is too long for a Python string");
672 return NULL;
673 }
674 size += incr;
675 }
676
Victor Stinnerad771582015-10-09 12:38:53 +0200677 str = _PyBytesWriter_Prepare(writer, str, size);
678 if (str == NULL)
679 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200680
681 /* generate replacement */
682 for (i = collstart; i < collend; ++i) {
683 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
684 }
685 return str;
686}
687
Thomas Wouters477c8d52006-05-27 19:21:47 +0000688/* --- Bloom Filters ----------------------------------------------------- */
689
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used: the low bits of each
   character, (ch & (BLOOM_WIDTH - 1)), select the bit index.  BLOOM_WIDTH is
   the largest of 32, 64 and 128 that fits in an unsigned long. */

/* the linebreak mask is set up by Unicode_Init below */

#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* Starts with every bit set, so BLOOM() accepts every character until the
   mask is replaced. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by ch is set in mask.  Both arguments are
   parenthesized so that expressions such as BLOOM(a | b, ch) test the whole
   mask expression. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Linebreak test: table lookup below 128, otherwise the bloom mask is used
   as a cheap pre-filter in front of Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000715
Alexander Belopolsky40018472011-02-26 01:02:56 +0000716Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718{
Victor Stinnera85af502013-04-09 21:53:54 +0200719#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
720 do { \
721 TYPE *data = (TYPE *)PTR; \
722 TYPE *end = data + LEN; \
723 Py_UCS4 ch; \
724 for (; data != end; data++) { \
725 ch = *data; \
726 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
727 } \
728 break; \
729 } while (0)
730
Thomas Wouters477c8d52006-05-27 19:21:47 +0000731 /* calculate simple bloom-style bitmask for a given unicode string */
732
Antoine Pitrouf068f942010-01-13 14:19:12 +0000733 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734
735 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200736 switch (kind) {
737 case PyUnicode_1BYTE_KIND:
738 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
739 break;
740 case PyUnicode_2BYTE_KIND:
741 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
742 break;
743 case PyUnicode_4BYTE_KIND:
744 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
745 break;
746 default:
747 assert(0);
748 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000749 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200750
751#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000752}
753
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200754/* Compilation of templated routines */
755
756#include "stringlib/asciilib.h"
757#include "stringlib/fastsearch.h"
758#include "stringlib/partition.h"
759#include "stringlib/split.h"
760#include "stringlib/count.h"
761#include "stringlib/find.h"
762#include "stringlib/find_max_char.h"
763#include "stringlib/localeutil.h"
764#include "stringlib/undef.h"
765
766#include "stringlib/ucs1lib.h"
767#include "stringlib/fastsearch.h"
768#include "stringlib/partition.h"
769#include "stringlib/split.h"
770#include "stringlib/count.h"
771#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300772#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773#include "stringlib/find_max_char.h"
774#include "stringlib/localeutil.h"
775#include "stringlib/undef.h"
776
777#include "stringlib/ucs2lib.h"
778#include "stringlib/fastsearch.h"
779#include "stringlib/partition.h"
780#include "stringlib/split.h"
781#include "stringlib/count.h"
782#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300783#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200784#include "stringlib/find_max_char.h"
785#include "stringlib/localeutil.h"
786#include "stringlib/undef.h"
787
788#include "stringlib/ucs4lib.h"
789#include "stringlib/fastsearch.h"
790#include "stringlib/partition.h"
791#include "stringlib/split.h"
792#include "stringlib/count.h"
793#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300794#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200795#include "stringlib/find_max_char.h"
796#include "stringlib/localeutil.h"
797#include "stringlib/undef.h"
798
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200799#include "stringlib/unicodedefs.h"
800#include "stringlib/fastsearch.h"
801#include "stringlib/count.h"
802#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100803#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200804
Guido van Rossumd57fd912000-03-10 22:53:23 +0000805/* --- Unicode Object ----------------------------------------------------- */
806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200808fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200810Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200811 Py_ssize_t size, Py_UCS4 ch,
812 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200814 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
815
816 switch (kind) {
817 case PyUnicode_1BYTE_KIND:
818 {
819 Py_UCS1 ch1 = (Py_UCS1) ch;
820 if (ch1 == ch)
821 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
822 else
823 return -1;
824 }
825 case PyUnicode_2BYTE_KIND:
826 {
827 Py_UCS2 ch2 = (Py_UCS2) ch;
828 if (ch2 == ch)
829 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
830 else
831 return -1;
832 }
833 case PyUnicode_4BYTE_KIND:
834 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
835 default:
836 assert(0);
837 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200838 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200839}
840
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in the range
   [old_length, current length) of a Unicode string with 0xff bytes, so that
   bugs using characters that were never written are detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    /* Only the part added by growing the string is poisoned. */
    if (new_length > old_length) {
        memset(bytes + old_length * char_width, 0xff,
               (new_length - old_length) * char_width);
    }
}
#endif
859
Victor Stinnerfe226c02011-10-03 03:52:20 +0200860static PyObject*
861resize_compact(PyObject *unicode, Py_ssize_t length)
862{
863 Py_ssize_t char_size;
864 Py_ssize_t struct_size;
865 Py_ssize_t new_size;
866 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100867 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200868#ifdef Py_DEBUG
869 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
870#endif
871
Victor Stinner79891572012-05-03 13:43:07 +0200872 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200873 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100874 assert(PyUnicode_IS_COMPACT(unicode));
875
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200876 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100877 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200878 struct_size = sizeof(PyASCIIObject);
879 else
880 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200881 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200882
Victor Stinnerfe226c02011-10-03 03:52:20 +0200883 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
884 PyErr_NoMemory();
885 return NULL;
886 }
887 new_size = (struct_size + (length + 1) * char_size);
888
Victor Stinner84def372011-12-11 20:04:56 +0100889 _Py_DEC_REFTOTAL;
890 _Py_ForgetReference(unicode);
891
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300892 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100893 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100894 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200895 PyErr_NoMemory();
896 return NULL;
897 }
Victor Stinner84def372011-12-11 20:04:56 +0100898 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200899 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100900
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200902 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200903 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100904 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100907 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
908 PyObject_DEL(_PyUnicode_WSTR(unicode));
909 _PyUnicode_WSTR(unicode) = NULL;
910 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200911#ifdef Py_DEBUG
912 unicode_fill_invalid(unicode, old_length);
913#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200914 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
915 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200916 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917 return unicode;
918}
919
Alexander Belopolsky40018472011-02-26 01:02:56 +0000920static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200921resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000922{
Victor Stinner95663112011-10-04 01:03:50 +0200923 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100924 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200925 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200926 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000927
Victor Stinnerfe226c02011-10-03 03:52:20 +0200928 if (PyUnicode_IS_READY(unicode)) {
929 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200930 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200931 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200932#ifdef Py_DEBUG
933 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
934#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935
936 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200937 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200938 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
939 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940
941 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
942 PyErr_NoMemory();
943 return -1;
944 }
945 new_size = (length + 1) * char_size;
946
Victor Stinner7a9105a2011-12-12 00:13:42 +0100947 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
948 {
949 PyObject_DEL(_PyUnicode_UTF8(unicode));
950 _PyUnicode_UTF8(unicode) = NULL;
951 _PyUnicode_UTF8_LENGTH(unicode) = 0;
952 }
953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 data = (PyObject *)PyObject_REALLOC(data, new_size);
955 if (data == NULL) {
956 PyErr_NoMemory();
957 return -1;
958 }
959 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200960 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200962 _PyUnicode_WSTR_LENGTH(unicode) = length;
963 }
964 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200965 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200966 _PyUnicode_UTF8_LENGTH(unicode) = length;
967 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 _PyUnicode_LENGTH(unicode) = length;
969 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200970#ifdef Py_DEBUG
971 unicode_fill_invalid(unicode, old_length);
972#endif
Victor Stinner95663112011-10-04 01:03:50 +0200973 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200974 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200975 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200976 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977 }
Victor Stinner95663112011-10-04 01:03:50 +0200978 assert(_PyUnicode_WSTR(unicode) != NULL);
979
980 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700981 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200982 PyErr_NoMemory();
983 return -1;
984 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100985 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200986 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100987 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200988 if (!wstr) {
989 PyErr_NoMemory();
990 return -1;
991 }
992 _PyUnicode_WSTR(unicode) = wstr;
993 _PyUnicode_WSTR(unicode)[length] = 0;
994 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200995 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000996 return 0;
997}
998
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999static PyObject*
1000resize_copy(PyObject *unicode, Py_ssize_t length)
1001{
1002 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001003 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001004 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001005
Benjamin Petersonbac79492012-01-14 13:34:47 -05001006 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001007 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001008
1009 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1010 if (copy == NULL)
1011 return NULL;
1012
1013 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001014 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001015 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001016 }
1017 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001018 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001019
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001020 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001021 if (w == NULL)
1022 return NULL;
1023 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1024 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001025 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1026 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001027 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028 }
1029}
1030
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001032 Ux0000 terminated; some code (e.g. new_identifier)
1033 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034
1035 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001036 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037
1038*/
1039
Alexander Belopolsky40018472011-02-26 01:02:56 +00001040static PyUnicodeObject *
1041_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001043 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045
Thomas Wouters477c8d52006-05-27 19:21:47 +00001046 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 if (length == 0 && unicode_empty != NULL) {
1048 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001049 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001050 }
1051
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001052 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001053 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001054 return (PyUnicodeObject *)PyErr_NoMemory();
1055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 if (length < 0) {
1057 PyErr_SetString(PyExc_SystemError,
1058 "Negative size passed to _PyUnicode_New");
1059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 }
1061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1063 if (unicode == NULL)
1064 return NULL;
1065 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001066
1067 _PyUnicode_WSTR_LENGTH(unicode) = length;
1068 _PyUnicode_HASH(unicode) = -1;
1069 _PyUnicode_STATE(unicode).interned = 0;
1070 _PyUnicode_STATE(unicode).kind = 0;
1071 _PyUnicode_STATE(unicode).compact = 0;
1072 _PyUnicode_STATE(unicode).ready = 0;
1073 _PyUnicode_STATE(unicode).ascii = 0;
1074 _PyUnicode_DATA_ANY(unicode) = NULL;
1075 _PyUnicode_LENGTH(unicode) = 0;
1076 _PyUnicode_UTF8(unicode) = NULL;
1077 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001079 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1080 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001081 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001082 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001083 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085
Jeremy Hyltond8082792003-09-16 19:41:39 +00001086 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001087 * the caller fails before initializing str -- unicode_resize()
1088 * reads str[0], and the Keep-Alive optimization can keep memory
1089 * allocated for str alive across a call to unicode_dealloc(unicode).
1090 * We don't want unicode_resize to read uninitialized memory in
1091 * that case.
1092 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093 _PyUnicode_WSTR(unicode)[0] = 0;
1094 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001095
Victor Stinner7931d9a2011-11-04 00:22:48 +01001096 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 return unicode;
1098}
1099
Victor Stinnerf42dc442011-10-02 23:33:16 +02001100static const char*
1101unicode_kind_name(PyObject *unicode)
1102{
Victor Stinner42dfd712011-10-03 14:41:45 +02001103 /* don't check consistency: unicode_kind_name() is called from
1104 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001105 if (!PyUnicode_IS_COMPACT(unicode))
1106 {
1107 if (!PyUnicode_IS_READY(unicode))
1108 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001109 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001110 {
1111 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001112 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001113 return "legacy ascii";
1114 else
1115 return "legacy latin1";
1116 case PyUnicode_2BYTE_KIND:
1117 return "legacy UCS2";
1118 case PyUnicode_4BYTE_KIND:
1119 return "legacy UCS4";
1120 default:
1121 return "<legacy invalid kind>";
1122 }
1123 }
1124 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001125 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001127 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001128 return "ascii";
1129 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001130 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001132 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001133 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001134 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001135 default:
1136 return "<invalid compact kind>";
1137 }
1138}
1139
#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Wrapper around the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Wrapper around the PyUnicode_DATA() macro which also prints, on stdout,
   the values the result is derived from.  Pointers printed with %p are
   passed as void*, the type that conversion requires. */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", (void *)_PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of the internal state of a string on stdout:
   its kind name, length, wstr and utf8 pointers (flagged "shared" when they
   alias the character data) and the address of the character data.  Fields
   are read straight from the structures, without consistency checks. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Locate the character data: right after the header for compact
       strings, in a separate buffer otherwise. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    /* The lengths are Py_ssize_t but printed with an unsigned conversion:
       cast them to size_t so that argument and conversion agree. */
    printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
           unicode_kind_name(op), (size_t)ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* The wstr_length/utf8 fields are skipped for compact ASCII strings. */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%" PY_FORMAT_SIZE_T "u), ", (size_t)compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
               (void *)compact->utf8, (size_t)compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
1193
1194PyObject *
1195PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1196{
1197 PyObject *obj;
1198 PyCompactUnicodeObject *unicode;
1199 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001200 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001201 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202 Py_ssize_t char_size;
1203 Py_ssize_t struct_size;
1204
1205 /* Optimization for empty strings */
1206 if (size == 0 && unicode_empty != NULL) {
1207 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001208 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209 }
1210
Victor Stinner9e9d6892011-10-04 01:02:02 +02001211 is_ascii = 0;
1212 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001213 struct_size = sizeof(PyCompactUnicodeObject);
1214 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001215 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 char_size = 1;
1217 is_ascii = 1;
1218 struct_size = sizeof(PyASCIIObject);
1219 }
1220 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001221 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222 char_size = 1;
1223 }
1224 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001225 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 char_size = 2;
1227 if (sizeof(wchar_t) == 2)
1228 is_sharing = 1;
1229 }
1230 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001231 if (maxchar > MAX_UNICODE) {
1232 PyErr_SetString(PyExc_SystemError,
1233 "invalid maximum character passed to PyUnicode_New");
1234 return NULL;
1235 }
Victor Stinner8f825062012-04-27 13:55:39 +02001236 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001237 char_size = 4;
1238 if (sizeof(wchar_t) == 4)
1239 is_sharing = 1;
1240 }
1241
1242 /* Ensure we won't overflow the size. */
1243 if (size < 0) {
1244 PyErr_SetString(PyExc_SystemError,
1245 "Negative size passed to PyUnicode_New");
1246 return NULL;
1247 }
1248 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1249 return PyErr_NoMemory();
1250
1251 /* Duplicated allocation code from _PyObject_New() instead of a call to
1252 * PyObject_New() so we are able to allocate space for the object and
1253 * it's data buffer.
1254 */
1255 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1256 if (obj == NULL)
1257 return PyErr_NoMemory();
1258 obj = PyObject_INIT(obj, &PyUnicode_Type);
1259 if (obj == NULL)
1260 return NULL;
1261
1262 unicode = (PyCompactUnicodeObject *)obj;
1263 if (is_ascii)
1264 data = ((PyASCIIObject*)obj) + 1;
1265 else
1266 data = unicode + 1;
1267 _PyUnicode_LENGTH(unicode) = size;
1268 _PyUnicode_HASH(unicode) = -1;
1269 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001270 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001271 _PyUnicode_STATE(unicode).compact = 1;
1272 _PyUnicode_STATE(unicode).ready = 1;
1273 _PyUnicode_STATE(unicode).ascii = is_ascii;
1274 if (is_ascii) {
1275 ((char*)data)[size] = 0;
1276 _PyUnicode_WSTR(unicode) = NULL;
1277 }
Victor Stinner8f825062012-04-27 13:55:39 +02001278 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 ((char*)data)[size] = 0;
1280 _PyUnicode_WSTR(unicode) = NULL;
1281 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001283 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001285 else {
1286 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001287 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001288 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001290 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001291 ((Py_UCS4*)data)[size] = 0;
1292 if (is_sharing) {
1293 _PyUnicode_WSTR_LENGTH(unicode) = size;
1294 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1295 }
1296 else {
1297 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1298 _PyUnicode_WSTR(unicode) = NULL;
1299 }
1300 }
Victor Stinner8f825062012-04-27 13:55:39 +02001301#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001302 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001303#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001304 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 return obj;
1306}
1307
#if SIZEOF_WCHAR_T == 2
/* Helper converting a 16-bit wchar_t buffer [begin, end) into the UCS4 data
   of a 4-byte kind string.  A high surrogate immediately followed by a low
   surrogate is joined into one code point; every other unit is copied
   unchanged.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        Py_UCS4 ch;

        /* never write past the characters of the target string */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            ch = *src;
            src += 1;
        }
        *dst++ = ch;
    }
    /* the target string must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1347
Victor Stinnercd9950f2011-10-02 00:34:53 +02001348static int
Victor Stinner488fa492011-12-12 00:01:39 +01001349unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001350{
Victor Stinner488fa492011-12-12 00:01:39 +01001351 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001352 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001353 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001354 return -1;
1355 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001356 return 0;
1357}
1358
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001359static int
1360_copy_characters(PyObject *to, Py_ssize_t to_start,
1361 PyObject *from, Py_ssize_t from_start,
1362 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001364 unsigned int from_kind, to_kind;
1365 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366
Victor Stinneree4544c2012-05-09 22:24:08 +02001367 assert(0 <= how_many);
1368 assert(0 <= from_start);
1369 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001371 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001372 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373
Victor Stinnerd3f08822012-05-29 12:57:52 +02001374 assert(PyUnicode_Check(to));
1375 assert(PyUnicode_IS_READY(to));
1376 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1377
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001378 if (how_many == 0)
1379 return 0;
1380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001382 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001384 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385
Victor Stinnerf1852262012-06-16 16:38:26 +02001386#ifdef Py_DEBUG
1387 if (!check_maxchar
1388 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1389 {
1390 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1391 Py_UCS4 ch;
1392 Py_ssize_t i;
1393 for (i=0; i < how_many; i++) {
1394 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1395 assert(ch <= to_maxchar);
1396 }
1397 }
1398#endif
1399
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001400 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001401 if (check_maxchar
1402 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1403 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 /* Writing Latin-1 characters into an ASCII string requires to
1405 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001406 Py_UCS4 max_char;
1407 max_char = ucs1lib_find_max_char(from_data,
1408 (Py_UCS1*)from_data + how_many);
1409 if (max_char >= 128)
1410 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001411 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001412 Py_MEMCPY((char*)to_data + to_kind * to_start,
1413 (char*)from_data + from_kind * from_start,
1414 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001416 else if (from_kind == PyUnicode_1BYTE_KIND
1417 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001418 {
1419 _PyUnicode_CONVERT_BYTES(
1420 Py_UCS1, Py_UCS2,
1421 PyUnicode_1BYTE_DATA(from) + from_start,
1422 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1423 PyUnicode_2BYTE_DATA(to) + to_start
1424 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001425 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001426 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001427 && to_kind == PyUnicode_4BYTE_KIND)
1428 {
1429 _PyUnicode_CONVERT_BYTES(
1430 Py_UCS1, Py_UCS4,
1431 PyUnicode_1BYTE_DATA(from) + from_start,
1432 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1433 PyUnicode_4BYTE_DATA(to) + to_start
1434 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001435 }
1436 else if (from_kind == PyUnicode_2BYTE_KIND
1437 && to_kind == PyUnicode_4BYTE_KIND)
1438 {
1439 _PyUnicode_CONVERT_BYTES(
1440 Py_UCS2, Py_UCS4,
1441 PyUnicode_2BYTE_DATA(from) + from_start,
1442 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1443 PyUnicode_4BYTE_DATA(to) + to_start
1444 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001445 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001446 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001447 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1448
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001449 if (!check_maxchar) {
1450 if (from_kind == PyUnicode_2BYTE_KIND
1451 && to_kind == PyUnicode_1BYTE_KIND)
1452 {
1453 _PyUnicode_CONVERT_BYTES(
1454 Py_UCS2, Py_UCS1,
1455 PyUnicode_2BYTE_DATA(from) + from_start,
1456 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1457 PyUnicode_1BYTE_DATA(to) + to_start
1458 );
1459 }
1460 else if (from_kind == PyUnicode_4BYTE_KIND
1461 && to_kind == PyUnicode_1BYTE_KIND)
1462 {
1463 _PyUnicode_CONVERT_BYTES(
1464 Py_UCS4, Py_UCS1,
1465 PyUnicode_4BYTE_DATA(from) + from_start,
1466 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1467 PyUnicode_1BYTE_DATA(to) + to_start
1468 );
1469 }
1470 else if (from_kind == PyUnicode_4BYTE_KIND
1471 && to_kind == PyUnicode_2BYTE_KIND)
1472 {
1473 _PyUnicode_CONVERT_BYTES(
1474 Py_UCS4, Py_UCS2,
1475 PyUnicode_4BYTE_DATA(from) + from_start,
1476 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1477 PyUnicode_2BYTE_DATA(to) + to_start
1478 );
1479 }
1480 else {
1481 assert(0);
1482 return -1;
1483 }
1484 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001485 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001486 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001487 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001488 Py_ssize_t i;
1489
Victor Stinnera0702ab2011-09-29 14:14:38 +02001490 for (i=0; i < how_many; i++) {
1491 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001492 if (ch > to_maxchar)
1493 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001494 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1495 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001496 }
1497 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001498 return 0;
1499}
1500
Victor Stinnerd3f08822012-05-29 12:57:52 +02001501void
1502_PyUnicode_FastCopyCharacters(
1503 PyObject *to, Py_ssize_t to_start,
1504 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001505{
1506 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1507}
1508
1509Py_ssize_t
1510PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1511 PyObject *from, Py_ssize_t from_start,
1512 Py_ssize_t how_many)
1513{
1514 int err;
1515
1516 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1517 PyErr_BadInternalCall();
1518 return -1;
1519 }
1520
Benjamin Petersonbac79492012-01-14 13:34:47 -05001521 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001522 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001523 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 return -1;
1525
Victor Stinnerd3f08822012-05-29 12:57:52 +02001526 if (from_start < 0) {
1527 PyErr_SetString(PyExc_IndexError, "string index out of range");
1528 return -1;
1529 }
1530 if (to_start < 0) {
1531 PyErr_SetString(PyExc_IndexError, "string index out of range");
1532 return -1;
1533 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001534 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1535 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1536 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001537 "Cannot write %zi characters at %zi "
1538 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 how_many, to_start, PyUnicode_GET_LENGTH(to));
1540 return -1;
1541 }
1542
1543 if (how_many == 0)
1544 return 0;
1545
Victor Stinner488fa492011-12-12 00:01:39 +01001546 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547 return -1;
1548
1549 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1550 if (err) {
1551 PyErr_Format(PyExc_SystemError,
1552 "Cannot copy %s characters "
1553 "into a string of %s characters",
1554 unicode_kind_name(from),
1555 unicode_kind_name(to));
1556 return -1;
1557 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001558 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559}
1560
Victor Stinner17222162011-09-28 22:15:37 +02001561/* Find the maximum code point and count the number of surrogate pairs so a
1562 correct string length can be computed before converting a string to UCS4.
1563 This function counts single surrogates as a character and not as a pair.
1564
1565 Return 0 on success, or -1 on error. */
1566static int
1567find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1568 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569{
1570 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001571 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572
Victor Stinnerc53be962011-10-02 21:33:54 +02001573 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 *num_surrogates = 0;
1575 *maxchar = 0;
1576
1577 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001578#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001579 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1580 && (iter+1) < end
1581 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1582 {
1583 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1584 ++(*num_surrogates);
1585 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001586 }
1587 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001589 {
1590 ch = *iter;
1591 iter++;
1592 }
1593 if (ch > *maxchar) {
1594 *maxchar = ch;
1595 if (*maxchar > MAX_UNICODE) {
1596 PyErr_Format(PyExc_ValueError,
1597 "character U+%x is not in range [U+0000; U+10ffff]",
1598 ch);
1599 return -1;
1600 }
1601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602 }
1603 return 0;
1604}
1605
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001606int
1607_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608{
1609 wchar_t *end;
1610 Py_UCS4 maxchar = 0;
1611 Py_ssize_t num_surrogates;
1612#if SIZEOF_WCHAR_T == 2
1613 Py_ssize_t length_wo_surrogates;
1614#endif
1615
Georg Brandl7597add2011-10-05 16:36:47 +02001616 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001617 strings were created using _PyObject_New() and where no canonical
1618 representation (the str field) has been set yet aka strings
1619 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001620 assert(_PyUnicode_CHECK(unicode));
1621 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001622 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001623 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001624 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001625 /* Actually, it should neither be interned nor be anything else: */
1626 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001629 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001630 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632
1633 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001634 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1635 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 PyErr_NoMemory();
1637 return -1;
1638 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001639 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 _PyUnicode_WSTR(unicode), end,
1641 PyUnicode_1BYTE_DATA(unicode));
1642 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1643 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1644 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1645 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001646 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001647 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001648 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001649 }
1650 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001651 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001652 _PyUnicode_UTF8(unicode) = NULL;
1653 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654 }
1655 PyObject_FREE(_PyUnicode_WSTR(unicode));
1656 _PyUnicode_WSTR(unicode) = NULL;
1657 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1658 }
1659 /* In this case we might have to convert down from 4-byte native
1660 wchar_t to 2-byte unicode. */
1661 else if (maxchar < 65536) {
1662 assert(num_surrogates == 0 &&
1663 "FindMaxCharAndNumSurrogatePairs() messed up");
1664
Victor Stinner506f5922011-09-28 22:34:18 +02001665#if SIZEOF_WCHAR_T == 2
1666 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001667 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001668 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1669 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1670 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001671 _PyUnicode_UTF8(unicode) = NULL;
1672 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001673#else
1674 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001675 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001676 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001677 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001678 PyErr_NoMemory();
1679 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 }
Victor Stinner506f5922011-09-28 22:34:18 +02001681 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1682 _PyUnicode_WSTR(unicode), end,
1683 PyUnicode_2BYTE_DATA(unicode));
1684 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1685 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1686 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001687 _PyUnicode_UTF8(unicode) = NULL;
1688 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001689 PyObject_FREE(_PyUnicode_WSTR(unicode));
1690 _PyUnicode_WSTR(unicode) = NULL;
1691 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1692#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 }
1694 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1695 else {
1696#if SIZEOF_WCHAR_T == 2
1697 /* in case the native representation is 2-bytes, we need to allocate a
1698 new normalized 4-byte version. */
1699 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001700 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1701 PyErr_NoMemory();
1702 return -1;
1703 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001704 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1705 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 PyErr_NoMemory();
1707 return -1;
1708 }
1709 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1710 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001711 _PyUnicode_UTF8(unicode) = NULL;
1712 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001713 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1714 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001715 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 PyObject_FREE(_PyUnicode_WSTR(unicode));
1717 _PyUnicode_WSTR(unicode) = NULL;
1718 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1719#else
1720 assert(num_surrogates == 0);
1721
Victor Stinnerc3c74152011-10-02 20:39:55 +02001722 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001724 _PyUnicode_UTF8(unicode) = NULL;
1725 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1727#endif
1728 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1729 }
1730 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001731 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001732 return 0;
1733}
1734
Alexander Belopolsky40018472011-02-26 01:02:56 +00001735static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001736unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737{
Walter Dörwald16807132007-05-25 13:52:07 +00001738 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001739 case SSTATE_NOT_INTERNED:
1740 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001741
Benjamin Peterson29060642009-01-31 22:14:21 +00001742 case SSTATE_INTERNED_MORTAL:
1743 /* revive dead object temporarily for DelItem */
1744 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001745 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001746 Py_FatalError(
1747 "deletion of interned string failed");
1748 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001749
Benjamin Peterson29060642009-01-31 22:14:21 +00001750 case SSTATE_INTERNED_IMMORTAL:
1751 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001752
Benjamin Peterson29060642009-01-31 22:14:21 +00001753 default:
1754 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001755 }
1756
Victor Stinner03490912011-10-03 23:45:12 +02001757 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001759 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001760 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001761 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1762 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001764 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765}
1766
#ifdef Py_DEBUG
/* Debug helper: return 1 if the string is the shared empty string or one
   of the cached single Latin-1 character strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of exactly one character that already has a canonical
       kind can be a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1783
Alexander Belopolsky40018472011-02-26 01:02:56 +00001784static int
Victor Stinner488fa492011-12-12 00:01:39 +01001785unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001786{
Victor Stinner488fa492011-12-12 00:01:39 +01001787 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001788 if (Py_REFCNT(unicode) != 1)
1789 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001790 if (_PyUnicode_HASH(unicode) != -1)
1791 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001792 if (PyUnicode_CHECK_INTERNED(unicode))
1793 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001794 if (!PyUnicode_CheckExact(unicode))
1795 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001796#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001797 /* singleton refcount is greater than 1 */
1798 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001799#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001800 return 1;
1801}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001802
Victor Stinnerfe226c02011-10-03 03:52:20 +02001803static int
1804unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1805{
1806 PyObject *unicode;
1807 Py_ssize_t old_length;
1808
1809 assert(p_unicode != NULL);
1810 unicode = *p_unicode;
1811
1812 assert(unicode != NULL);
1813 assert(PyUnicode_Check(unicode));
1814 assert(0 <= length);
1815
Victor Stinner910337b2011-10-03 03:20:16 +02001816 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001817 old_length = PyUnicode_WSTR_LENGTH(unicode);
1818 else
1819 old_length = PyUnicode_GET_LENGTH(unicode);
1820 if (old_length == length)
1821 return 0;
1822
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001823 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001824 _Py_INCREF_UNICODE_EMPTY();
1825 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001826 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001827 Py_DECREF(*p_unicode);
1828 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001829 return 0;
1830 }
1831
Victor Stinner488fa492011-12-12 00:01:39 +01001832 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833 PyObject *copy = resize_copy(unicode, length);
1834 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001835 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001836 Py_DECREF(*p_unicode);
1837 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001838 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001839 }
1840
Victor Stinnerfe226c02011-10-03 03:52:20 +02001841 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001842 PyObject *new_unicode = resize_compact(unicode, length);
1843 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001844 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001845 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001846 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001847 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001848 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001849}
1850
Alexander Belopolsky40018472011-02-26 01:02:56 +00001851int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001852PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001853{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001854 PyObject *unicode;
1855 if (p_unicode == NULL) {
1856 PyErr_BadInternalCall();
1857 return -1;
1858 }
1859 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001860 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001861 {
1862 PyErr_BadInternalCall();
1863 return -1;
1864 }
1865 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001866}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001867
Victor Stinnerc5166102012-02-22 13:55:02 +01001868/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001869
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001870 WARNING: The function doesn't copy the terminating null character and
1871 doesn't check the maximum character (may write a latin1 character in an
1872 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001873static void
1874unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1875 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001876{
1877 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1878 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001879 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001880
1881 switch (kind) {
1882 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001883 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001884#ifdef Py_DEBUG
1885 if (PyUnicode_IS_ASCII(unicode)) {
1886 Py_UCS4 maxchar = ucs1lib_find_max_char(
1887 (const Py_UCS1*)str,
1888 (const Py_UCS1*)str + len);
1889 assert(maxchar < 128);
1890 }
1891#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001892 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001893 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001894 }
1895 case PyUnicode_2BYTE_KIND: {
1896 Py_UCS2 *start = (Py_UCS2 *)data + index;
1897 Py_UCS2 *ucs2 = start;
1898 assert(index <= PyUnicode_GET_LENGTH(unicode));
1899
Victor Stinner184252a2012-06-16 02:57:41 +02001900 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001901 *ucs2 = (Py_UCS2)*str;
1902
1903 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001904 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001905 }
1906 default: {
1907 Py_UCS4 *start = (Py_UCS4 *)data + index;
1908 Py_UCS4 *ucs4 = start;
1909 assert(kind == PyUnicode_4BYTE_KIND);
1910 assert(index <= PyUnicode_GET_LENGTH(unicode));
1911
Victor Stinner184252a2012-06-16 02:57:41 +02001912 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001913 *ucs4 = (Py_UCS4)*str;
1914
1915 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001916 }
1917 }
1918}
1919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920static PyObject*
1921get_latin1_char(unsigned char ch)
1922{
Victor Stinnera464fc12011-10-02 20:39:30 +02001923 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001924 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001925 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 if (!unicode)
1927 return NULL;
1928 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001929 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 unicode_latin1[ch] = unicode;
1931 }
1932 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001933 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934}
1935
Victor Stinner985a82a2014-01-03 12:53:47 +01001936static PyObject*
1937unicode_char(Py_UCS4 ch)
1938{
1939 PyObject *unicode;
1940
1941 assert(ch <= MAX_UNICODE);
1942
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001943 if (ch < 256)
1944 return get_latin1_char(ch);
1945
Victor Stinner985a82a2014-01-03 12:53:47 +01001946 unicode = PyUnicode_New(1, ch);
1947 if (unicode == NULL)
1948 return NULL;
1949 switch (PyUnicode_KIND(unicode)) {
1950 case PyUnicode_1BYTE_KIND:
1951 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1952 break;
1953 case PyUnicode_2BYTE_KIND:
1954 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1955 break;
1956 default:
1957 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1958 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1959 }
1960 assert(_PyUnicode_CheckConsistency(unicode, 1));
1961 return unicode;
1962}
1963
Alexander Belopolsky40018472011-02-26 01:02:56 +00001964PyObject *
1965PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001967 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 Py_UCS4 maxchar = 0;
1969 Py_ssize_t num_surrogates;
1970
1971 if (u == NULL)
1972 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001974 /* If the Unicode data is known at construction time, we can apply
1975 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001978 if (size == 0)
1979 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 /* Single character Unicode objects in the Latin-1 range are
1982 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001983 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 return get_latin1_char((unsigned char)*u);
1985
1986 /* If not empty and not single character, copy the Unicode data
1987 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001988 if (find_maxchar_surrogates(u, u + size,
1989 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 return NULL;
1991
Victor Stinner8faf8212011-12-08 22:14:11 +01001992 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993 if (!unicode)
1994 return NULL;
1995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 switch (PyUnicode_KIND(unicode)) {
1997 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001998 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2000 break;
2001 case PyUnicode_2BYTE_KIND:
2002#if Py_UNICODE_SIZE == 2
2003 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2004#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002005 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2007#endif
2008 break;
2009 case PyUnicode_4BYTE_KIND:
2010#if SIZEOF_WCHAR_T == 2
2011 /* This is the only case which has to process surrogates, thus
2012 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002013 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014#else
2015 assert(num_surrogates == 0);
2016 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2017#endif
2018 break;
2019 default:
2020 assert(0 && "Impossible state");
2021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002023 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024}
2025
Alexander Belopolsky40018472011-02-26 01:02:56 +00002026PyObject *
2027PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002028{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002029 if (size < 0) {
2030 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002032 return NULL;
2033 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002034 if (u != NULL)
2035 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2036 else
2037 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002038}
2039
Alexander Belopolsky40018472011-02-26 01:02:56 +00002040PyObject *
2041PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002042{
2043 size_t size = strlen(u);
2044 if (size > PY_SSIZE_T_MAX) {
2045 PyErr_SetString(PyExc_OverflowError, "input too long");
2046 return NULL;
2047 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002048 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002049}
2050
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002051PyObject *
2052_PyUnicode_FromId(_Py_Identifier *id)
2053{
2054 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002055 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2056 strlen(id->string),
2057 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002058 if (!id->object)
2059 return NULL;
2060 PyUnicode_InternInPlace(&id->object);
2061 assert(!id->next);
2062 id->next = static_strings;
2063 static_strings = id;
2064 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002065 return id->object;
2066}
2067
2068void
2069_PyUnicode_ClearStaticStrings()
2070{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002071 _Py_Identifier *tmp, *s = static_strings;
2072 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002073 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002074 tmp = s->next;
2075 s->next = NULL;
2076 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002077 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002078 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002079}
2080
Benjamin Peterson0df54292012-03-26 14:50:32 -04002081/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002082
Victor Stinnerd3f08822012-05-29 12:57:52 +02002083PyObject*
2084_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002085{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002086 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002087 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002088 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002089#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002090 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002091#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002092 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002093 }
Victor Stinner785938e2011-12-11 20:09:03 +01002094 unicode = PyUnicode_New(size, 127);
2095 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002096 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002097 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2098 assert(_PyUnicode_CheckConsistency(unicode, 1));
2099 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002100}
2101
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002102static Py_UCS4
2103kind_maxchar_limit(unsigned int kind)
2104{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002105 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002106 case PyUnicode_1BYTE_KIND:
2107 return 0x80;
2108 case PyUnicode_2BYTE_KIND:
2109 return 0x100;
2110 case PyUnicode_4BYTE_KIND:
2111 return 0x10000;
2112 default:
2113 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002114 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002115 }
2116}
2117
Victor Stinnere6abb482012-05-02 01:15:40 +02002118Py_LOCAL_INLINE(Py_UCS4)
2119align_maxchar(Py_UCS4 maxchar)
2120{
2121 if (maxchar <= 127)
2122 return 127;
2123 else if (maxchar <= 255)
2124 return 255;
2125 else if (maxchar <= 65535)
2126 return 65535;
2127 else
2128 return MAX_UNICODE;
2129}
2130
Victor Stinner702c7342011-10-05 13:50:52 +02002131static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002132_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002135 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002136
Serhiy Storchaka678db842013-01-26 12:16:36 +02002137 if (size == 0)
2138 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002139 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002140 if (size == 1)
2141 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002142
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002143 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002144 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 if (!res)
2146 return NULL;
2147 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002148 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002150}
2151
Victor Stinnere57b1c02011-09-28 22:20:48 +02002152static PyObject*
2153_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154{
2155 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002156 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002157
Serhiy Storchaka678db842013-01-26 12:16:36 +02002158 if (size == 0)
2159 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002160 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002161 if (size == 1)
2162 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002163
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002164 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002165 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 if (!res)
2167 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002168 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002170 else {
2171 _PyUnicode_CONVERT_BYTES(
2172 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2173 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002174 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002175 return res;
2176}
2177
Victor Stinnere57b1c02011-09-28 22:20:48 +02002178static PyObject*
2179_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180{
2181 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002182 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002183
Serhiy Storchaka678db842013-01-26 12:16:36 +02002184 if (size == 0)
2185 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002186 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002187 if (size == 1)
2188 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002189
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002190 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002191 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 if (!res)
2193 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002194 if (max_char < 256)
2195 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2196 PyUnicode_1BYTE_DATA(res));
2197 else if (max_char < 0x10000)
2198 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2199 PyUnicode_2BYTE_DATA(res));
2200 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002202 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 return res;
2204}
2205
2206PyObject*
2207PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2208{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002209 if (size < 0) {
2210 PyErr_SetString(PyExc_ValueError, "size must be positive");
2211 return NULL;
2212 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002213 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002215 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002217 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002219 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002220 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002221 PyErr_SetString(PyExc_SystemError, "invalid kind");
2222 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224}
2225
Victor Stinnerece58de2012-04-23 23:36:38 +02002226Py_UCS4
2227_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2228{
2229 enum PyUnicode_Kind kind;
2230 void *startptr, *endptr;
2231
2232 assert(PyUnicode_IS_READY(unicode));
2233 assert(0 <= start);
2234 assert(end <= PyUnicode_GET_LENGTH(unicode));
2235 assert(start <= end);
2236
2237 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2238 return PyUnicode_MAX_CHAR_VALUE(unicode);
2239
2240 if (start == end)
2241 return 127;
2242
Victor Stinner94d558b2012-04-27 22:26:58 +02002243 if (PyUnicode_IS_ASCII(unicode))
2244 return 127;
2245
Victor Stinnerece58de2012-04-23 23:36:38 +02002246 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002247 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002248 endptr = (char *)startptr + end * kind;
2249 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002250 switch(kind) {
2251 case PyUnicode_1BYTE_KIND:
2252 return ucs1lib_find_max_char(startptr, endptr);
2253 case PyUnicode_2BYTE_KIND:
2254 return ucs2lib_find_max_char(startptr, endptr);
2255 case PyUnicode_4BYTE_KIND:
2256 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002257 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002258 assert(0);
2259 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002260 }
2261}
2262
Victor Stinner25a4b292011-10-06 12:31:55 +02002263/* Ensure that a string uses the most efficient storage, if it is not the
2264 case: create a new string with of the right kind. Write NULL into *p_unicode
2265 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002266static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002267unicode_adjust_maxchar(PyObject **p_unicode)
2268{
2269 PyObject *unicode, *copy;
2270 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002271 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002272 unsigned int kind;
2273
2274 assert(p_unicode != NULL);
2275 unicode = *p_unicode;
2276 assert(PyUnicode_IS_READY(unicode));
2277 if (PyUnicode_IS_ASCII(unicode))
2278 return;
2279
2280 len = PyUnicode_GET_LENGTH(unicode);
2281 kind = PyUnicode_KIND(unicode);
2282 if (kind == PyUnicode_1BYTE_KIND) {
2283 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002284 max_char = ucs1lib_find_max_char(u, u + len);
2285 if (max_char >= 128)
2286 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002287 }
2288 else if (kind == PyUnicode_2BYTE_KIND) {
2289 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002290 max_char = ucs2lib_find_max_char(u, u + len);
2291 if (max_char >= 256)
2292 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002293 }
2294 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002295 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002296 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002297 max_char = ucs4lib_find_max_char(u, u + len);
2298 if (max_char >= 0x10000)
2299 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002300 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002301 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002302 if (copy != NULL)
2303 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002304 Py_DECREF(unicode);
2305 *p_unicode = copy;
2306}
2307
Victor Stinner034f6cf2011-09-30 02:26:44 +02002308PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002309_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002310{
Victor Stinner87af4f22011-11-21 23:03:47 +01002311 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002312 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002313
Victor Stinner034f6cf2011-09-30 02:26:44 +02002314 if (!PyUnicode_Check(unicode)) {
2315 PyErr_BadInternalCall();
2316 return NULL;
2317 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002318 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002319 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002320
Victor Stinner87af4f22011-11-21 23:03:47 +01002321 length = PyUnicode_GET_LENGTH(unicode);
2322 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002323 if (!copy)
2324 return NULL;
2325 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2326
Victor Stinner87af4f22011-11-21 23:03:47 +01002327 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2328 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002329 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002330 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002331}
2332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333
Victor Stinnerbc603d12011-10-02 01:00:40 +02002334/* Widen Unicode objects to larger buffers. Don't write terminating null
2335 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002336
2337void*
2338_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2339{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002340 Py_ssize_t len;
2341 void *result;
2342 unsigned int skind;
2343
Benjamin Petersonbac79492012-01-14 13:34:47 -05002344 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002345 return NULL;
2346
2347 len = PyUnicode_GET_LENGTH(s);
2348 skind = PyUnicode_KIND(s);
2349 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002350 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 return NULL;
2352 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002353 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002354 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002355 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002356 if (!result)
2357 return PyErr_NoMemory();
2358 assert(skind == PyUnicode_1BYTE_KIND);
2359 _PyUnicode_CONVERT_BYTES(
2360 Py_UCS1, Py_UCS2,
2361 PyUnicode_1BYTE_DATA(s),
2362 PyUnicode_1BYTE_DATA(s) + len,
2363 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002365 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002366 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002367 if (!result)
2368 return PyErr_NoMemory();
2369 if (skind == PyUnicode_2BYTE_KIND) {
2370 _PyUnicode_CONVERT_BYTES(
2371 Py_UCS2, Py_UCS4,
2372 PyUnicode_2BYTE_DATA(s),
2373 PyUnicode_2BYTE_DATA(s) + len,
2374 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002376 else {
2377 assert(skind == PyUnicode_1BYTE_KIND);
2378 _PyUnicode_CONVERT_BYTES(
2379 Py_UCS1, Py_UCS4,
2380 PyUnicode_1BYTE_DATA(s),
2381 PyUnicode_1BYTE_DATA(s) + len,
2382 result);
2383 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002385 default:
2386 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 }
Victor Stinner01698042011-10-04 00:04:26 +02002388 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 return NULL;
2390}
2391
2392static Py_UCS4*
2393as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2394 int copy_null)
2395{
2396 int kind;
2397 void *data;
2398 Py_ssize_t len, targetlen;
2399 if (PyUnicode_READY(string) == -1)
2400 return NULL;
2401 kind = PyUnicode_KIND(string);
2402 data = PyUnicode_DATA(string);
2403 len = PyUnicode_GET_LENGTH(string);
2404 targetlen = len;
2405 if (copy_null)
2406 targetlen++;
2407 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002408 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 if (!target) {
2410 PyErr_NoMemory();
2411 return NULL;
2412 }
2413 }
2414 else {
2415 if (targetsize < targetlen) {
2416 PyErr_Format(PyExc_SystemError,
2417 "string is longer than the buffer");
2418 if (copy_null && 0 < targetsize)
2419 target[0] = 0;
2420 return NULL;
2421 }
2422 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002423 if (kind == PyUnicode_1BYTE_KIND) {
2424 Py_UCS1 *start = (Py_UCS1 *) data;
2425 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002427 else if (kind == PyUnicode_2BYTE_KIND) {
2428 Py_UCS2 *start = (Py_UCS2 *) data;
2429 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2430 }
2431 else {
2432 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002434 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 if (copy_null)
2436 target[len] = 0;
2437 return target;
2438}
2439
2440Py_UCS4*
2441PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2442 int copy_null)
2443{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002444 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 PyErr_BadInternalCall();
2446 return NULL;
2447 }
2448 return as_ucs4(string, target, targetsize, copy_null);
2449}
2450
2451Py_UCS4*
2452PyUnicode_AsUCS4Copy(PyObject *string)
2453{
2454 return as_ucs4(string, NULL, 0, 1);
2455}
2456
2457#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002458
Alexander Belopolsky40018472011-02-26 01:02:56 +00002459PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002460PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002464 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 PyErr_BadInternalCall();
2466 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467 }
2468
Martin v. Löwis790465f2008-04-05 20:41:37 +00002469 if (size == -1) {
2470 size = wcslen(w);
2471 }
2472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474}
2475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002477
/* Upper bound on the number of characters produced by %lld or %p.
   A value of SIZEOF_LONG_LONG bytes needs at most
   ceil(log10(256) * SIZEOF_LONG_LONG) decimal digits, plus 1 for the
   sign.  53/22 is an upper bound for log10(256), and
   1 + (x*53 - 1) / 22 is the integer form of ceil(x*53 / 22).
   NOTE(review): no separate slot is reserved for the NUL written by
   sprintf().  With an 8-byte long long the value is 21, which holds the
   longest 20-character output plus the NUL -- confirm for other sizes. */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG * 53) - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002482
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002483static int
2484unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2485 Py_ssize_t width, Py_ssize_t precision)
2486{
2487 Py_ssize_t length, fill, arglen;
2488 Py_UCS4 maxchar;
2489
2490 if (PyUnicode_READY(str) == -1)
2491 return -1;
2492
2493 length = PyUnicode_GET_LENGTH(str);
2494 if ((precision == -1 || precision >= length)
2495 && width <= length)
2496 return _PyUnicodeWriter_WriteStr(writer, str);
2497
2498 if (precision != -1)
2499 length = Py_MIN(precision, length);
2500
2501 arglen = Py_MAX(length, width);
2502 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2503 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2504 else
2505 maxchar = writer->maxchar;
2506
2507 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2508 return -1;
2509
2510 if (width > length) {
2511 fill = width - length;
2512 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2513 return -1;
2514 writer->pos += fill;
2515 }
2516
2517 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2518 str, 0, length);
2519 writer->pos += length;
2520 return 0;
2521}
2522
2523static int
2524unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2525 Py_ssize_t width, Py_ssize_t precision)
2526{
2527 /* UTF-8 */
2528 Py_ssize_t length;
2529 PyObject *unicode;
2530 int res;
2531
2532 length = strlen(str);
2533 if (precision != -1)
2534 length = Py_MIN(length, precision);
2535 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2536 if (unicode == NULL)
2537 return -1;
2538
2539 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2540 Py_DECREF(unicode);
2541 return res;
2542}
2543
Victor Stinner96865452011-03-01 23:44:09 +00002544static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002545unicode_fromformat_arg(_PyUnicodeWriter *writer,
2546 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002547{
Victor Stinnere215d962012-10-06 23:03:36 +02002548 const char *p;
2549 Py_ssize_t len;
2550 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002551 Py_ssize_t width;
2552 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002553 int longflag;
2554 int longlongflag;
2555 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002556 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002557
2558 p = f;
2559 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002560 zeropad = 0;
2561 if (*f == '0') {
2562 zeropad = 1;
2563 f++;
2564 }
Victor Stinner96865452011-03-01 23:44:09 +00002565
2566 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002567 width = -1;
2568 if (Py_ISDIGIT((unsigned)*f)) {
2569 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002570 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002571 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002572 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002573 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002575 return NULL;
2576 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002577 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002578 f++;
2579 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002580 }
2581 precision = -1;
2582 if (*f == '.') {
2583 f++;
2584 if (Py_ISDIGIT((unsigned)*f)) {
2585 precision = (*f - '0');
2586 f++;
2587 while (Py_ISDIGIT((unsigned)*f)) {
2588 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2589 PyErr_SetString(PyExc_ValueError,
2590 "precision too big");
2591 return NULL;
2592 }
2593 precision = (precision * 10) + (*f - '0');
2594 f++;
2595 }
2596 }
Victor Stinner96865452011-03-01 23:44:09 +00002597 if (*f == '%') {
2598 /* "%.3%s" => f points to "3" */
2599 f--;
2600 }
2601 }
2602 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002603 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002604 f--;
2605 }
Victor Stinner96865452011-03-01 23:44:09 +00002606
2607 /* Handle %ld, %lu, %lld and %llu. */
2608 longflag = 0;
2609 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002610 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002611 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002612 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002613 longflag = 1;
2614 ++f;
2615 }
2616#ifdef HAVE_LONG_LONG
2617 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002618 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002619 longlongflag = 1;
2620 f += 2;
2621 }
2622#endif
2623 }
2624 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002625 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002626 size_tflag = 1;
2627 ++f;
2628 }
Victor Stinnere215d962012-10-06 23:03:36 +02002629
2630 if (f[1] == '\0')
2631 writer->overallocate = 0;
2632
2633 switch (*f) {
2634 case 'c':
2635 {
2636 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002637 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002638 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002639 "character argument not in range(0x110000)");
2640 return NULL;
2641 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002642 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002643 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002644 break;
2645 }
2646
2647 case 'i':
2648 case 'd':
2649 case 'u':
2650 case 'x':
2651 {
2652 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002653 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002654 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002655
2656 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002657 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002658 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002659 va_arg(*vargs, unsigned long));
2660#ifdef HAVE_LONG_LONG
2661 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002662 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002663 va_arg(*vargs, unsigned PY_LONG_LONG));
2664#endif
2665 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002666 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002667 va_arg(*vargs, size_t));
2668 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002669 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002670 va_arg(*vargs, unsigned int));
2671 }
2672 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002673 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002674 }
2675 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002676 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002677 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002678 va_arg(*vargs, long));
2679#ifdef HAVE_LONG_LONG
2680 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002681 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002682 va_arg(*vargs, PY_LONG_LONG));
2683#endif
2684 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002685 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002686 va_arg(*vargs, Py_ssize_t));
2687 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002688 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002689 va_arg(*vargs, int));
2690 }
2691 assert(len >= 0);
2692
Victor Stinnere215d962012-10-06 23:03:36 +02002693 if (precision < len)
2694 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002695
2696 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002697 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2698 return NULL;
2699
Victor Stinnere215d962012-10-06 23:03:36 +02002700 if (width > precision) {
2701 Py_UCS4 fillchar;
2702 fill = width - precision;
2703 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002704 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2705 return NULL;
2706 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002707 }
Victor Stinner15a11362012-10-06 23:48:20 +02002708 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002709 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002710 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2711 return NULL;
2712 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002713 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002714
Victor Stinner4a587072013-11-19 12:54:53 +01002715 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2716 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002717 break;
2718 }
2719
2720 case 'p':
2721 {
2722 char number[MAX_LONG_LONG_CHARS];
2723
2724 len = sprintf(number, "%p", va_arg(*vargs, void*));
2725 assert(len >= 0);
2726
2727 /* %p is ill-defined: ensure leading 0x. */
2728 if (number[1] == 'X')
2729 number[1] = 'x';
2730 else if (number[1] != 'x') {
2731 memmove(number + 2, number,
2732 strlen(number) + 1);
2733 number[0] = '0';
2734 number[1] = 'x';
2735 len += 2;
2736 }
2737
Victor Stinner4a587072013-11-19 12:54:53 +01002738 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002739 return NULL;
2740 break;
2741 }
2742
2743 case 's':
2744 {
2745 /* UTF-8 */
2746 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002748 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002749 break;
2750 }
2751
2752 case 'U':
2753 {
2754 PyObject *obj = va_arg(*vargs, PyObject *);
2755 assert(obj && _PyUnicode_CHECK(obj));
2756
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002757 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002758 return NULL;
2759 break;
2760 }
2761
2762 case 'V':
2763 {
2764 PyObject *obj = va_arg(*vargs, PyObject *);
2765 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002766 if (obj) {
2767 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002768 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002769 return NULL;
2770 }
2771 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002772 assert(str != NULL);
2773 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002774 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002775 }
2776 break;
2777 }
2778
2779 case 'S':
2780 {
2781 PyObject *obj = va_arg(*vargs, PyObject *);
2782 PyObject *str;
2783 assert(obj);
2784 str = PyObject_Str(obj);
2785 if (!str)
2786 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002787 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002788 Py_DECREF(str);
2789 return NULL;
2790 }
2791 Py_DECREF(str);
2792 break;
2793 }
2794
2795 case 'R':
2796 {
2797 PyObject *obj = va_arg(*vargs, PyObject *);
2798 PyObject *repr;
2799 assert(obj);
2800 repr = PyObject_Repr(obj);
2801 if (!repr)
2802 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002803 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002804 Py_DECREF(repr);
2805 return NULL;
2806 }
2807 Py_DECREF(repr);
2808 break;
2809 }
2810
2811 case 'A':
2812 {
2813 PyObject *obj = va_arg(*vargs, PyObject *);
2814 PyObject *ascii;
2815 assert(obj);
2816 ascii = PyObject_ASCII(obj);
2817 if (!ascii)
2818 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002819 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002820 Py_DECREF(ascii);
2821 return NULL;
2822 }
2823 Py_DECREF(ascii);
2824 break;
2825 }
2826
2827 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002828 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002829 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002830 break;
2831
2832 default:
2833 /* if we stumble upon an unknown formatting code, copy the rest
2834 of the format string to the output string. (we cannot just
2835 skip the code, since there's no way to know what's in the
2836 argument list) */
2837 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002838 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002839 return NULL;
2840 f = p+len;
2841 return f;
2842 }
2843
2844 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002845 return f;
2846}
2847
Walter Dörwaldd2034312007-05-18 16:29:38 +00002848PyObject *
2849PyUnicode_FromFormatV(const char *format, va_list vargs)
2850{
Victor Stinnere215d962012-10-06 23:03:36 +02002851 va_list vargs2;
2852 const char *f;
2853 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002854
Victor Stinner8f674cc2013-04-17 23:02:17 +02002855 _PyUnicodeWriter_Init(&writer);
2856 writer.min_length = strlen(format) + 100;
2857 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002858
2859 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2860 Copy it to be able to pass a reference to a subfunction. */
2861 Py_VA_COPY(vargs2, vargs);
2862
2863 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002864 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002865 f = unicode_fromformat_arg(&writer, f, &vargs2);
2866 if (f == NULL)
2867 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002869 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002870 const char *p;
2871 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002872
Victor Stinnere215d962012-10-06 23:03:36 +02002873 p = f;
2874 do
2875 {
2876 if ((unsigned char)*p > 127) {
2877 PyErr_Format(PyExc_ValueError,
2878 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2879 "string, got a non-ASCII byte: 0x%02x",
2880 (unsigned char)*p);
2881 return NULL;
2882 }
2883 p++;
2884 }
2885 while (*p != '\0' && *p != '%');
2886 len = p - f;
2887
2888 if (*p == '\0')
2889 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002890
2891 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002892 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002893
2894 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002895 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 }
Victor Stinnere215d962012-10-06 23:03:36 +02002897 return _PyUnicodeWriter_Finish(&writer);
2898
2899 fail:
2900 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002901 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002902}
2903
Walter Dörwaldd2034312007-05-18 16:29:38 +00002904PyObject *
2905PyUnicode_FromFormat(const char *format, ...)
2906{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002907 PyObject* ret;
2908 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002909
2910#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002911 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002912#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002913 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002914#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002915 ret = PyUnicode_FromFormatV(format, vargs);
2916 va_end(vargs);
2917 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002918}
2919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002920#ifdef HAVE_WCHAR_H
2921
Victor Stinner5593d8a2010-10-02 11:11:27 +00002922/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2923 convert a Unicode object to a wide character string.
2924
Victor Stinnerd88d9832011-09-06 02:00:05 +02002925 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002926 character) required to convert the unicode object. Ignore size argument.
2927
Victor Stinnerd88d9832011-09-06 02:00:05 +02002928 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002929 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002930 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002931static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002932unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002933 wchar_t *w,
2934 Py_ssize_t size)
2935{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002936 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002937 const wchar_t *wstr;
2938
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002939 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002940 if (wstr == NULL)
2941 return -1;
2942
Victor Stinner5593d8a2010-10-02 11:11:27 +00002943 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002944 if (size > res)
2945 size = res + 1;
2946 else
2947 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002948 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949 return res;
2950 }
2951 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002952 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002953}
2954
2955Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002956PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002957 wchar_t *w,
2958 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959{
2960 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002961 PyErr_BadInternalCall();
2962 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002964 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965}
2966
Victor Stinner137c34c2010-09-29 10:25:54 +00002967wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002968PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002969 Py_ssize_t *size)
2970{
2971 wchar_t* buffer;
2972 Py_ssize_t buflen;
2973
2974 if (unicode == NULL) {
2975 PyErr_BadInternalCall();
2976 return NULL;
2977 }
2978
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002979 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002980 if (buflen == -1)
2981 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002982 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002983 if (buffer == NULL) {
2984 PyErr_NoMemory();
2985 return NULL;
2986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002987 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002988 if (buflen == -1) {
2989 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002990 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002991 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002992 if (size != NULL)
2993 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002994 return buffer;
2995}
2996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002997#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998
Alexander Belopolsky40018472011-02-26 01:02:56 +00002999PyObject *
3000PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003001{
Victor Stinner8faf8212011-12-08 22:14:11 +01003002 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 PyErr_SetString(PyExc_ValueError,
3004 "chr() arg not in range(0x110000)");
3005 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003006 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003007
Victor Stinner985a82a2014-01-03 12:53:47 +01003008 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003009}
3010
Alexander Belopolsky40018472011-02-26 01:02:56 +00003011PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003012PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003014 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003016 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003017 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003018 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 Py_INCREF(obj);
3020 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003021 }
3022 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003023 /* For a Unicode subtype that's not a Unicode object,
3024 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003025 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003026 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003027 PyErr_Format(PyExc_TypeError,
3028 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003029 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003030 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003031}
3032
Alexander Belopolsky40018472011-02-26 01:02:56 +00003033PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003034PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003035 const char *encoding,
3036 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003037{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003038 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003039 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003040
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 PyErr_BadInternalCall();
3043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003045
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003046 /* Decoding bytes objects is the most common case and should be fast */
3047 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003048 if (PyBytes_GET_SIZE(obj) == 0)
3049 _Py_RETURN_UNICODE_EMPTY();
3050 v = PyUnicode_Decode(
3051 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3052 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003053 return v;
3054 }
3055
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003056 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 PyErr_SetString(PyExc_TypeError,
3058 "decoding str is not supported");
3059 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003060 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003061
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003062 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3063 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3064 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02003065 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003066 Py_TYPE(obj)->tp_name);
3067 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003068 }
Tim Petersced69f82003-09-16 20:30:58 +00003069
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003070 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003071 PyBuffer_Release(&buffer);
3072 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003074
Serhiy Storchaka05997252013-01-26 12:14:02 +02003075 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003076 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003077 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078}
3079
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   encoding:  null-terminated encoding name, or NULL which is normalized
              to "utf-8".
   lower:     output buffer receiving the null-terminated normalized name.
   lower_len: size of the output buffer in bytes, terminator included.

   Return 0 on error (encoding is longer than lower_len-1, or the buffer is
   empty), 1 on success. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* An empty buffer cannot even hold the terminator. Bail out before
       computing &lower[lower_len - 1], whose index would wrap around. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last writable slot: reserved for the trailing null character. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3119
Alexander Belopolsky40018472011-02-26 01:02:56 +00003120PyObject *
3121PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003122 Py_ssize_t size,
3123 const char *encoding,
3124 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003125{
3126 PyObject *buffer = NULL, *unicode;
3127 Py_buffer info;
3128 char lower[11]; /* Enough for any encoding shortcut */
3129
Fred Drakee4315f52000-05-09 19:53:39 +00003130 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003131 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003132 if ((strcmp(lower, "utf-8") == 0) ||
3133 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003134 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003135 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003136 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003137 (strcmp(lower, "iso-8859-1") == 0) ||
3138 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003139 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003140#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003141 else if (strcmp(lower, "mbcs") == 0)
3142 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003143#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003144 else if (strcmp(lower, "ascii") == 0)
3145 return PyUnicode_DecodeASCII(s, size, errors);
3146 else if (strcmp(lower, "utf-16") == 0)
3147 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3148 else if (strcmp(lower, "utf-32") == 0)
3149 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151
3152 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003153 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003154 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003155 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003156 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 if (buffer == NULL)
3158 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003159 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 if (unicode == NULL)
3161 goto onError;
3162 if (!PyUnicode_Check(unicode)) {
3163 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003164 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3165 "use codecs.decode() to decode to arbitrary types",
3166 encoding,
3167 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 Py_DECREF(unicode);
3169 goto onError;
3170 }
3171 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003172 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003173
Benjamin Peterson29060642009-01-31 22:14:21 +00003174 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175 Py_XDECREF(buffer);
3176 return NULL;
3177}
3178
Alexander Belopolsky40018472011-02-26 01:02:56 +00003179PyObject *
3180PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003181 const char *encoding,
3182 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003183{
3184 PyObject *v;
3185
3186 if (!PyUnicode_Check(unicode)) {
3187 PyErr_BadArgument();
3188 goto onError;
3189 }
3190
3191 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003192 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003193
3194 /* Decode via the codec registry */
3195 v = PyCodec_Decode(unicode, encoding, errors);
3196 if (v == NULL)
3197 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003198 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003199
Benjamin Peterson29060642009-01-31 22:14:21 +00003200 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003201 return NULL;
3202}
3203
Alexander Belopolsky40018472011-02-26 01:02:56 +00003204PyObject *
3205PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003206 const char *encoding,
3207 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003208{
3209 PyObject *v;
3210
3211 if (!PyUnicode_Check(unicode)) {
3212 PyErr_BadArgument();
3213 goto onError;
3214 }
3215
3216 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003218
3219 /* Decode via the codec registry */
3220 v = PyCodec_Decode(unicode, encoding, errors);
3221 if (v == NULL)
3222 goto onError;
3223 if (!PyUnicode_Check(v)) {
3224 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003225 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3226 "use codecs.decode() to decode to arbitrary types",
3227 encoding,
3228 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003229 Py_DECREF(v);
3230 goto onError;
3231 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003232 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003233
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003235 return NULL;
3236}
3237
Alexander Belopolsky40018472011-02-26 01:02:56 +00003238PyObject *
3239PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003240 Py_ssize_t size,
3241 const char *encoding,
3242 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243{
3244 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003245
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 unicode = PyUnicode_FromUnicode(s, size);
3247 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3250 Py_DECREF(unicode);
3251 return v;
3252}
3253
Alexander Belopolsky40018472011-02-26 01:02:56 +00003254PyObject *
3255PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003256 const char *encoding,
3257 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003258{
3259 PyObject *v;
3260
3261 if (!PyUnicode_Check(unicode)) {
3262 PyErr_BadArgument();
3263 goto onError;
3264 }
3265
3266 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003268
3269 /* Encode via the codec registry */
3270 v = PyCodec_Encode(unicode, encoding, errors);
3271 if (v == NULL)
3272 goto onError;
3273 return v;
3274
Benjamin Peterson29060642009-01-31 22:14:21 +00003275 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003276 return NULL;
3277}
3278
/* Locate the first wide character of `wstr` that wcstombs() refuses to
   convert, by converting the string one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).

   Return the index of the offending character in `wstr`, or 0 when every
   character converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];
#else
    wchar_t chunk[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;
    const wchar_t *chunk_start;
    size_t converted;

    /* The last slot of the chunk is always the terminator. */
#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif

    while (*cursor != L'\0') {
        chunk_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of a surrogate pair together. */
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif
        converted = wcstombs(scratch, chunk, sizeof(scratch));
        if (converted == (size_t)-1)
            return chunk_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3325
Victor Stinner1b579672011-12-17 05:47:23 +01003326static int
3327locale_error_handler(const char *errors, int *surrogateescape)
3328{
Victor Stinner50149202015-09-22 00:26:54 +02003329 _Py_error_handler error_handler = get_error_handler(errors);
3330 switch (error_handler)
3331 {
3332 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003333 *surrogateescape = 0;
3334 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003335 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003336 *surrogateescape = 1;
3337 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003338 default:
3339 PyErr_Format(PyExc_ValueError,
3340 "only 'strict' and 'surrogateescape' error handlers "
3341 "are supported, not '%s'",
3342 errors);
3343 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003344 }
Victor Stinner1b579672011-12-17 05:47:23 +01003345}
3346
/* Encode the str object `unicode` to a bytes object through the C library
   wide character conversion: Py_EncodeLocale() for the "surrogateescape"
   error handler, wcstombs() for "strict". Other error handlers are
   rejected by locale_error_handler().

   Return a new bytes object, or NULL with an exception set:
   - ValueError if the string contains an embedded null character;
   - MemoryError if Py_EncodeLocale() fails with an error position of
     (size_t)-1;
   - otherwise a UnicodeEncodeError is built for the "locale" codec and
     passed to PyCodec_StrictErrors(). */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    PyObject *bytes = NULL;
    char *errmsg;
    PyObject *reason = NULL;
    PyObject *exc;
    size_t error_pos;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* The C conversion functions stop at the first null character: if the
       C length differs from the reported length, the string has an embedded
       null and cannot be encoded this way. */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        char *str;

        str = Py_EncodeLocale(wstr, &error_pos);
        if (str == NULL) {
            /* An error position of (size_t)-1 is treated as an allocation
               failure; any other value is the index of the character that
               could not be encoded. */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        /* strict mode */
        size_t len, len2;

        /* First call with a NULL destination: only compute the length. */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            /* Position unknown: it is searched for under encode_error. */
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        /* Second call: convert into the bytes object; len+1 leaves room
           for the trailing null byte. */
        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

encode_error:
    /* Read errno first, before any of the calls below can change it. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    /* wstr is still alive here: use it to find the offending character
       when the failing call did not report a position. */
    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    PyMem_Free(wstr);
    Py_XDECREF(bytes);

    if (errmsg != NULL) {
        size_t errlen;
        /* wstr is reused to hold the decoded error message; this buffer
           comes from Py_DecodeLocale() and is freed with PyMem_RawFree(). */
        wstr = Py_DecodeLocale(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_RawFree(wstr);
        } else
            errmsg = NULL;
    }
    /* No usable system message: fall back to a generic reason. */
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    /* UnicodeEncodeError(encoding, object, start, end, reason), covering
       the single character at error_pos. */
    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        /* NOTE(review): presumably PyCodec_StrictErrors() raises exc;
           its return value is not used here -- confirm. */
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
3456
Victor Stinnerad158722010-10-27 00:25:46 +00003457PyObject *
3458PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003459{
Victor Stinner99b95382011-07-04 14:23:54 +02003460#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003461 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003462#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003463 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003464#else
Victor Stinner793b5312011-04-27 00:24:21 +02003465 PyInterpreterState *interp = PyThreadState_GET()->interp;
3466 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3467 cannot use it to encode and decode filenames before it is loaded. Load
3468 the Python codec requires to encode at least its own filename. Use the C
3469 version of the locale codec until the codec registry is initialized and
3470 the Python codec is loaded.
3471
3472 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3473 cannot only rely on it: check also interp->fscodec_initialized for
3474 subinterpreters. */
3475 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003476 return PyUnicode_AsEncodedString(unicode,
3477 Py_FileSystemDefaultEncoding,
3478 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003479 }
3480 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003481 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003482 }
Victor Stinnerad158722010-10-27 00:25:46 +00003483#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003484}
3485
Alexander Belopolsky40018472011-02-26 01:02:56 +00003486PyObject *
3487PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003488 const char *encoding,
3489 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490{
3491 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003492 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003493
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 if (!PyUnicode_Check(unicode)) {
3495 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003496 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 }
Fred Drakee4315f52000-05-09 19:53:39 +00003498
Fred Drakee4315f52000-05-09 19:53:39 +00003499 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003500 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003501 if ((strcmp(lower, "utf-8") == 0) ||
3502 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003503 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003504 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003505 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003506 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003507 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003508 }
Victor Stinner37296e82010-06-10 13:36:23 +00003509 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003510 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003511 (strcmp(lower, "iso-8859-1") == 0) ||
3512 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003513 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003514#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003515 else if (strcmp(lower, "mbcs") == 0)
3516 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003517#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003518 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003519 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521
3522 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003523 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003525 return NULL;
3526
3527 /* The normal path */
3528 if (PyBytes_Check(v))
3529 return v;
3530
3531 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003532 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003533 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003534 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003535
3536 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003537 "encoder %s returned bytearray instead of bytes; "
3538 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003539 encoding);
3540 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003541 Py_DECREF(v);
3542 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003543 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003544
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003545 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3546 Py_DECREF(v);
3547 return b;
3548 }
3549
3550 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003551 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3552 "use codecs.encode() to encode to arbitrary types",
3553 encoding,
3554 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003555 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003556 return NULL;
3557}
3558
Alexander Belopolsky40018472011-02-26 01:02:56 +00003559PyObject *
3560PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003561 const char *encoding,
3562 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003563{
3564 PyObject *v;
3565
3566 if (!PyUnicode_Check(unicode)) {
3567 PyErr_BadArgument();
3568 goto onError;
3569 }
3570
3571 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003572 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003573
3574 /* Encode via the codec registry */
3575 v = PyCodec_Encode(unicode, encoding, errors);
3576 if (v == NULL)
3577 goto onError;
3578 if (!PyUnicode_Check(v)) {
3579 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003580 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3581 "use codecs.encode() to encode to arbitrary types",
3582 encoding,
3583 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003584 Py_DECREF(v);
3585 goto onError;
3586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003588
Benjamin Peterson29060642009-01-31 22:14:21 +00003589 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 return NULL;
3591}
3592
/* Find the offset of the first byte sequence of str that mbrtowc() cannot
   convert.

   str -- multibyte string to scan.
   len -- number of bytes of str to examine.

   Returns the byte offset (relative to str) at which mbrtowc() reported a
   conversion error or an incomplete character.  Returns 0 if no such
   position was found, or if mbrtowc() is not available. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    memset(&mbs, 0, sizeof mbs);
    while (len) {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0) {
            /* Reached end of string */
            break;
        }
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* Without mbrtowc() the position cannot be determined. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3623
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003624PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003625PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003626 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003627{
3628 wchar_t smallbuf[256];
3629 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3630 wchar_t *wstr;
3631 size_t wlen, wlen2;
3632 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003633 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003634 size_t error_pos;
3635 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003636 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3637 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003638
3639 if (locale_error_handler(errors, &surrogateescape) < 0)
3640 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003641
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003642 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3643 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003644 return NULL;
3645 }
3646
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003647 if (surrogateescape) {
3648 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003649 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003650 if (wstr == NULL) {
3651 if (wlen == (size_t)-1)
3652 PyErr_NoMemory();
3653 else
3654 PyErr_SetFromErrno(PyExc_OSError);
3655 return NULL;
3656 }
3657
3658 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003659 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003660 }
3661 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003662 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003663#ifndef HAVE_BROKEN_MBSTOWCS
3664 wlen = mbstowcs(NULL, str, 0);
3665#else
3666 wlen = len;
3667#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003668 if (wlen == (size_t)-1)
3669 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003670 if (wlen+1 <= smallbuf_len) {
3671 wstr = smallbuf;
3672 }
3673 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003674 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003675 if (!wstr)
3676 return PyErr_NoMemory();
3677 }
3678
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003679 wlen2 = mbstowcs(wstr, str, wlen+1);
3680 if (wlen2 == (size_t)-1) {
3681 if (wstr != smallbuf)
3682 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003683 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003684 }
3685#ifdef HAVE_BROKEN_MBSTOWCS
3686 assert(wlen2 == wlen);
3687#endif
3688 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3689 if (wstr != smallbuf)
3690 PyMem_Free(wstr);
3691 }
3692 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003693
3694decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003695 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003696 errmsg = strerror(errno);
3697 assert(errmsg != NULL);
3698
3699 error_pos = mbstowcs_errorpos(str, len);
3700 if (errmsg != NULL) {
3701 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003702 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003703 if (wstr != NULL) {
3704 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003705 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003706 }
Victor Stinner2f197072011-12-17 07:08:30 +01003707 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003708 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003709 reason = PyUnicode_FromString(
3710 "mbstowcs() encountered an invalid multibyte sequence");
3711 if (reason == NULL)
3712 return NULL;
3713
3714 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3715 "locale", str, len,
3716 (Py_ssize_t)error_pos,
3717 (Py_ssize_t)(error_pos+1),
3718 reason);
3719 Py_DECREF(reason);
3720 if (exc != NULL) {
3721 PyCodec_StrictErrors(exc);
3722 Py_XDECREF(exc);
3723 }
3724 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003725}
3726
3727PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003728PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003729{
3730 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003731 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003732}
3733
3734
3735PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003736PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003737 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003738 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3739}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003740
Christian Heimes5894ba72007-11-04 11:43:14 +00003741PyObject*
3742PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3743{
Victor Stinner99b95382011-07-04 14:23:54 +02003744#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003745 return PyUnicode_DecodeMBCS(s, size, NULL);
3746#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003747 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003748#else
Victor Stinner793b5312011-04-27 00:24:21 +02003749 PyInterpreterState *interp = PyThreadState_GET()->interp;
3750 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3751 cannot use it to encode and decode filenames before it is loaded. Load
3752 the Python codec requires to encode at least its own filename. Use the C
3753 version of the locale codec until the codec registry is initialized and
3754 the Python codec is loaded.
3755
3756 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3757 cannot only rely on it: check also interp->fscodec_initialized for
3758 subinterpreters. */
3759 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003760 return PyUnicode_Decode(s, size,
3761 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003762 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003763 }
3764 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003765 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003766 }
Victor Stinnerad158722010-10-27 00:25:46 +00003767#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003768}
3769
Martin v. Löwis011e8422009-05-05 04:43:17 +00003770
3771int
3772PyUnicode_FSConverter(PyObject* arg, void* addr)
3773{
3774 PyObject *output = NULL;
3775 Py_ssize_t size;
3776 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003777 if (arg == NULL) {
3778 Py_DECREF(*(PyObject**)addr);
3779 return 1;
3780 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003781 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003782 output = arg;
3783 Py_INCREF(output);
3784 }
3785 else {
3786 arg = PyUnicode_FromObject(arg);
3787 if (!arg)
3788 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003789 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003790 Py_DECREF(arg);
3791 if (!output)
3792 return 0;
3793 if (!PyBytes_Check(output)) {
3794 Py_DECREF(output);
3795 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3796 return 0;
3797 }
3798 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003799 size = PyBytes_GET_SIZE(output);
3800 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003801 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003802 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003803 Py_DECREF(output);
3804 return 0;
3805 }
3806 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003807 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003808}
3809
3810
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003811int
3812PyUnicode_FSDecoder(PyObject* arg, void* addr)
3813{
3814 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003815 if (arg == NULL) {
3816 Py_DECREF(*(PyObject**)addr);
3817 return 1;
3818 }
3819 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003820 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003822 output = arg;
3823 Py_INCREF(output);
3824 }
3825 else {
3826 arg = PyBytes_FromObject(arg);
3827 if (!arg)
3828 return 0;
3829 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3830 PyBytes_GET_SIZE(arg));
3831 Py_DECREF(arg);
3832 if (!output)
3833 return 0;
3834 if (!PyUnicode_Check(output)) {
3835 Py_DECREF(output);
3836 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3837 return 0;
3838 }
3839 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003840 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003841 Py_DECREF(output);
3842 return 0;
3843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003845 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003846 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003847 Py_DECREF(output);
3848 return 0;
3849 }
3850 *(PyObject**)addr = output;
3851 return Py_CLEANUP_SUPPORTED;
3852}
3853
3854
Martin v. Löwis5b222132007-06-10 09:51:05 +00003855char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003856PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003857{
Christian Heimesf3863112007-11-22 07:46:41 +00003858 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003860 if (!PyUnicode_Check(unicode)) {
3861 PyErr_BadArgument();
3862 return NULL;
3863 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003864 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003865 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003867 if (PyUnicode_UTF8(unicode) == NULL) {
3868 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3870 if (bytes == NULL)
3871 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003872 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3873 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003874 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 Py_DECREF(bytes);
3876 return NULL;
3877 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3879 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3880 PyBytes_AS_STRING(bytes),
3881 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003882 Py_DECREF(bytes);
3883 }
3884
3885 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003886 *psize = PyUnicode_UTF8_LENGTH(unicode);
3887 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003888}
3889
3890char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3894}
3895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003896Py_UNICODE *
3897PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3898{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899 const unsigned char *one_byte;
3900#if SIZEOF_WCHAR_T == 4
3901 const Py_UCS2 *two_bytes;
3902#else
3903 const Py_UCS4 *four_bytes;
3904 const Py_UCS4 *ucs4_end;
3905 Py_ssize_t num_surrogates;
3906#endif
3907 wchar_t *w;
3908 wchar_t *wchar_end;
3909
3910 if (!PyUnicode_Check(unicode)) {
3911 PyErr_BadArgument();
3912 return NULL;
3913 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003914 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003916 assert(_PyUnicode_KIND(unicode) != 0);
3917 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003919 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003921 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3922 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 num_surrogates = 0;
3924
3925 for (; four_bytes < ucs4_end; ++four_bytes) {
3926 if (*four_bytes > 0xFFFF)
3927 ++num_surrogates;
3928 }
3929
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003930 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3931 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3932 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 PyErr_NoMemory();
3934 return NULL;
3935 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003936 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003937
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003938 w = _PyUnicode_WSTR(unicode);
3939 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3940 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3942 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003943 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003945 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3946 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 }
3948 else
3949 *w = *four_bytes;
3950
3951 if (w > wchar_end) {
3952 assert(0 && "Miscalculated string end");
3953 }
3954 }
3955 *w = 0;
3956#else
3957 /* sizeof(wchar_t) == 4 */
3958 Py_FatalError("Impossible unicode object state, wstr and str "
3959 "should share memory already.");
3960 return NULL;
3961#endif
3962 }
3963 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003964 if ((size_t)_PyUnicode_LENGTH(unicode) >
3965 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3966 PyErr_NoMemory();
3967 return NULL;
3968 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003969 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3970 (_PyUnicode_LENGTH(unicode) + 1));
3971 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972 PyErr_NoMemory();
3973 return NULL;
3974 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003975 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3976 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3977 w = _PyUnicode_WSTR(unicode);
3978 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003980 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3981 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982 for (; w < wchar_end; ++one_byte, ++w)
3983 *w = *one_byte;
3984 /* null-terminate the wstr */
3985 *w = 0;
3986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003987 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003988#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003989 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003990 for (; w < wchar_end; ++two_bytes, ++w)
3991 *w = *two_bytes;
3992 /* null-terminate the wstr */
3993 *w = 0;
3994#else
3995 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003996 PyObject_FREE(_PyUnicode_WSTR(unicode));
3997 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998 Py_FatalError("Impossible unicode object state, wstr "
3999 "and str should share memory already.");
4000 return NULL;
4001#endif
4002 }
4003 else {
4004 assert(0 && "This should never happen.");
4005 }
4006 }
4007 }
4008 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004009 *size = PyUnicode_WSTR_LENGTH(unicode);
4010 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004011}
4012
Alexander Belopolsky40018472011-02-26 01:02:56 +00004013Py_UNICODE *
4014PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017}
4018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019
Alexander Belopolsky40018472011-02-26 01:02:56 +00004020Py_ssize_t
4021PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022{
4023 if (!PyUnicode_Check(unicode)) {
4024 PyErr_BadArgument();
4025 goto onError;
4026 }
4027 return PyUnicode_GET_SIZE(unicode);
4028
Benjamin Peterson29060642009-01-31 22:14:21 +00004029 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030 return -1;
4031}
4032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033Py_ssize_t
4034PyUnicode_GetLength(PyObject *unicode)
4035{
Victor Stinner07621332012-06-16 04:53:46 +02004036 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 PyErr_BadArgument();
4038 return -1;
4039 }
Victor Stinner07621332012-06-16 04:53:46 +02004040 if (PyUnicode_READY(unicode) == -1)
4041 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 return PyUnicode_GET_LENGTH(unicode);
4043}
4044
4045Py_UCS4
4046PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4047{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004048 void *data;
4049 int kind;
4050
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004051 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4052 PyErr_BadArgument();
4053 return (Py_UCS4)-1;
4054 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004055 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004056 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 return (Py_UCS4)-1;
4058 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004059 data = PyUnicode_DATA(unicode);
4060 kind = PyUnicode_KIND(unicode);
4061 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062}
4063
4064int
4065PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4066{
4067 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004068 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 return -1;
4070 }
Victor Stinner488fa492011-12-12 00:01:39 +01004071 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004072 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004073 PyErr_SetString(PyExc_IndexError, "string index out of range");
4074 return -1;
4075 }
Victor Stinner488fa492011-12-12 00:01:39 +01004076 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004077 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004078 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4079 PyErr_SetString(PyExc_ValueError, "character out of range");
4080 return -1;
4081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4083 index, ch);
4084 return 0;
4085}
4086
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4092
Victor Stinner554f3f02010-06-16 23:33:54 +00004093/* create or adjust a UnicodeDecodeError */
4094static void
4095make_decode_exception(PyObject **exceptionObject,
4096 const char *encoding,
4097 const char *input, Py_ssize_t length,
4098 Py_ssize_t startpos, Py_ssize_t endpos,
4099 const char *reason)
4100{
4101 if (*exceptionObject == NULL) {
4102 *exceptionObject = PyUnicodeDecodeError_Create(
4103 encoding, input, length, startpos, endpos, reason);
4104 }
4105 else {
4106 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4107 goto onError;
4108 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4109 goto onError;
4110 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4111 goto onError;
4112 }
4113 return;
4114
4115onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004116 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004117}
4118
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors, errorHandler -- handler name and cached handler object; the
                           handler is looked up with PyCodec_LookupError()
                           when *errorHandler is NULL.
   encoding, reason     -- used to build or update *exceptionObject.
   input, inend         -- bounds of the input; reloaded from the exception
                           after the callback, which may have replaced it.
   startinpos, endinpos -- range of the undecodable input; *endinpos is set
                           to the position returned by the handler.
   inptr                -- set to the input position where decoding resumes.
   output, outpos       -- wchar_t-kind result object and write position;
                           the object is resized when needed.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" prefix, leaving only the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Must not fall through: the bytes accessors below are only valid
           on a bytes object. */
        PyErr_SetString(PyExc_TypeError,
                        "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen characters.  wcsncpy() would stop at the first
       null character and zero-fill the rest, dropping whatever follows an
       embedded null in the replacement. */
    Py_MEMCPY(_PyUnicode_WSTR(*output) + *outpos, repwstr,
              repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4234
4235static int
4236unicode_decode_call_errorhandler_writer(
4237 const char *errors, PyObject **errorHandler,
4238 const char *encoding, const char *reason,
4239 const char **input, const char **inend, Py_ssize_t *startinpos,
4240 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4241 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4242{
4243 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4244
4245 PyObject *restuple = NULL;
4246 PyObject *repunicode = NULL;
4247 Py_ssize_t insize;
4248 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004249 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004250 PyObject *inputobj = NULL;
4251
4252 if (*errorHandler == NULL) {
4253 *errorHandler = PyCodec_LookupError(errors);
4254 if (*errorHandler == NULL)
4255 goto onError;
4256 }
4257
4258 make_decode_exception(exceptionObject,
4259 encoding,
4260 *input, *inend - *input,
4261 *startinpos, *endinpos,
4262 reason);
4263 if (*exceptionObject == NULL)
4264 goto onError;
4265
4266 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4267 if (restuple == NULL)
4268 goto onError;
4269 if (!PyTuple_Check(restuple)) {
4270 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4271 goto onError;
4272 }
4273 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004274 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004275
4276 /* Copy back the bytes variables, which might have been modified by the
4277 callback */
4278 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4279 if (!inputobj)
4280 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004281 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004283 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004284 *input = PyBytes_AS_STRING(inputobj);
4285 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004286 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004287 /* we can DECREF safely, as the exception has another reference,
4288 so the object won't go away. */
4289 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004293 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004294 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004295 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004296 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297
Victor Stinner8f674cc2013-04-17 23:02:17 +02004298 if (PyUnicode_READY(repunicode) < 0)
4299 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004300 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004301 if (replen > 1) {
4302 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004303 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004304 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4305 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4306 goto onError;
4307 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004308 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004309 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004310
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004311 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004312 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004315 Py_XDECREF(restuple);
4316 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004320 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321}
4322
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323/* --- UTF-7 Codec -------------------------------------------------------- */
4324
Antoine Pitrou244651a2009-05-04 18:56:13 +00004325/* See RFC2152 for details. We encode conservatively and decode liberally. */
4326
/* Base-64 helpers used by the UTF-7 codec. */

/* True when c is a character of the base-64 alphabet:
   A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* Base-64 value (0..63) of c.  Only meaningful when IS_BASE64(c) holds:
   any character that is not a letter, digit or '+' maps to 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) < 128 && (c) != '+')
4357
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be an integer; only 1..127 can ever be direct (NUL and
 * non-ASCII values never are).  directO and directWS are truth values.
 * They are parenthesised in the expansion so that a compound argument
 * such as `a || b` is evaluated as a unit rather than being split by
 * the `&&` that follows it. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004403
Alexander Belopolsky40018472011-02-26 01:02:56 +00004404PyObject *
4405PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004406 Py_ssize_t size,
4407 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004408{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004409 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4410}
4411
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412/* The decoder. The only state we preserve is our read position,
4413 * i.e. how many characters we have consumed. So if we end in the
4414 * middle of a shift sequence we have to back off the read position
4415 * and the output to the beginning of the sequence, otherwise we lose
4416 * all the shift state (seen bits, number of bits seen, high
4417 * surrogate). */
4418
Alexander Belopolsky40018472011-02-26 01:02:56 +00004419PyObject *
4420PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004421 Py_ssize_t size,
4422 const char *errors,
4423 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004424{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004426 Py_ssize_t startinpos;
4427 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004428 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004429 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004430 const char *errmsg = "";
4431 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004432 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 unsigned int base64bits = 0;
4434 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004435 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 PyObject *errorHandler = NULL;
4437 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004439 if (size == 0) {
4440 if (consumed)
4441 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004442 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004443 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004445 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004446 _PyUnicodeWriter_Init(&writer);
4447 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448
4449 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004450 e = s + size;
4451
4452 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004453 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004454 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004455 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004456
Antoine Pitrou244651a2009-05-04 18:56:13 +00004457 if (inShift) { /* in a base-64 section */
4458 if (IS_BASE64(ch)) { /* consume a base-64 character */
4459 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4460 base64bits += 6;
4461 s++;
4462 if (base64bits >= 16) {
4463 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004464 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004465 base64bits -= 16;
4466 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004467 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004468 if (surrogate) {
4469 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004470 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4471 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004472 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004473 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004474 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004475 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004476 }
4477 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004478 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004479 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004480 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004481 }
4482 }
Victor Stinner551ac952011-11-29 22:58:13 +01004483 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004484 /* first surrogate */
4485 surrogate = outCh;
4486 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004487 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004488 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004489 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004490 }
4491 }
4492 }
4493 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004494 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004495 if (base64bits > 0) { /* left-over bits */
4496 if (base64bits >= 6) {
4497 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004498 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 errmsg = "partial character in shift sequence";
4500 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004501 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 else {
4503 /* Some bits remain; they should be zero */
4504 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004505 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004506 errmsg = "non-zero padding bits in shift sequence";
4507 goto utf7Error;
4508 }
4509 }
4510 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004511 if (surrogate && DECODE_DIRECT(ch)) {
4512 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4513 goto onError;
4514 }
4515 surrogate = 0;
4516 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004517 /* '-' is absorbed; other terminating
4518 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004519 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004520 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521 }
4522 }
4523 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525 s++; /* consume '+' */
4526 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004527 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004528 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004529 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004530 }
4531 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004533 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004534 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004536 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004537 }
4538 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004540 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004541 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004542 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004543 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004544 else {
4545 startinpos = s-starts;
4546 s++;
4547 errmsg = "unexpected special character";
4548 goto utf7Error;
4549 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004553 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004554 errors, &errorHandler,
4555 "utf7", errmsg,
4556 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004557 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004559 }
4560
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 /* end of string */
4562
4563 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4564 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004565 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 if (surrogate ||
4567 (base64bits >= 6) ||
4568 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004570 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004571 errors, &errorHandler,
4572 "utf7", "unterminated shift sequence",
4573 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004574 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004575 goto onError;
4576 if (s < e)
4577 goto restart;
4578 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580
4581 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004582 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004584 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004585 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004586 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004587 writer.kind, writer.data, shiftOutStart);
4588 Py_XDECREF(errorHandler);
4589 Py_XDECREF(exc);
4590 _PyUnicodeWriter_Dealloc(&writer);
4591 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004592 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004593 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004594 }
4595 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004596 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004598 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004599
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 Py_XDECREF(errorHandler);
4601 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004602 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004603
Benjamin Peterson29060642009-01-31 22:14:21 +00004604 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605 Py_XDECREF(errorHandler);
4606 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004607 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004608 return NULL;
4609}
4610
4611
Alexander Belopolsky40018472011-02-26 01:02:56 +00004612PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004613_PyUnicode_EncodeUTF7(PyObject *str,
4614 int base64SetO,
4615 int base64WhiteSpace,
4616 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004617{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004618 int kind;
4619 void *data;
4620 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004621 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004622 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004623 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 unsigned int base64bits = 0;
4625 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 char * out;
4627 char * start;
4628
Benjamin Petersonbac79492012-01-14 13:34:47 -05004629 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004630 return NULL;
4631 kind = PyUnicode_KIND(str);
4632 data = PyUnicode_DATA(str);
4633 len = PyUnicode_GET_LENGTH(str);
4634
4635 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004638 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004639 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004640 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004641 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642 if (v == NULL)
4643 return NULL;
4644
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004645 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004646 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004647 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648
Antoine Pitrou244651a2009-05-04 18:56:13 +00004649 if (inShift) {
4650 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4651 /* shifting out */
4652 if (base64bits) { /* output remaining bits */
4653 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4654 base64buffer = 0;
4655 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656 }
4657 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004658 /* Characters not in the BASE64 set implicitly unshift the sequence
4659 so no '-' is required, except if the character is itself a '-' */
4660 if (IS_BASE64(ch) || ch == '-') {
4661 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004662 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 *out++ = (char) ch;
4664 }
4665 else {
4666 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004667 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669 else { /* not in a shift sequence */
4670 if (ch == '+') {
4671 *out++ = '+';
4672 *out++ = '-';
4673 }
4674 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4675 *out++ = (char) ch;
4676 }
4677 else {
4678 *out++ = '+';
4679 inShift = 1;
4680 goto encode_char;
4681 }
4682 }
4683 continue;
4684encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004685 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004686 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004687
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 /* code first surrogate */
4689 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004690 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004691 while (base64bits >= 6) {
4692 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4693 base64bits -= 6;
4694 }
4695 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004696 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004698 base64bits += 16;
4699 base64buffer = (base64buffer << 16) | ch;
4700 while (base64bits >= 6) {
4701 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4702 base64bits -= 6;
4703 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004704 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004705 if (base64bits)
4706 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4707 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004708 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004709 if (_PyBytes_Resize(&v, out - start) < 0)
4710 return NULL;
4711 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004712}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004713PyObject *
4714PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4715 Py_ssize_t size,
4716 int base64SetO,
4717 int base64WhiteSpace,
4718 const char *errors)
4719{
4720 PyObject *result;
4721 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4722 if (tmp == NULL)
4723 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004724 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004725 base64WhiteSpace, errors);
4726 Py_DECREF(tmp);
4727 return result;
4728}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004729
Antoine Pitrou244651a2009-05-04 18:56:13 +00004730#undef IS_BASE64
4731#undef FROM_BASE64
4732#undef TO_BASE64
4733#undef DECODE_DIRECT
4734#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736/* --- UTF-8 Codec -------------------------------------------------------- */
4737
Alexander Belopolsky40018472011-02-26 01:02:56 +00004738PyObject *
4739PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004740 Py_ssize_t size,
4741 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742{
Walter Dörwald69652032004-09-07 20:24:22 +00004743 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4744}
4745
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004746#include "stringlib/asciilib.h"
4747#include "stringlib/codecs.h"
4748#include "stringlib/undef.h"
4749
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004750#include "stringlib/ucs1lib.h"
4751#include "stringlib/codecs.h"
4752#include "stringlib/undef.h"
4753
4754#include "stringlib/ucs2lib.h"
4755#include "stringlib/codecs.h"
4756#include "stringlib/undef.h"
4757
4758#include "stringlib/ucs4lib.h"
4759#include "stringlib/codecs.h"
4760#include "stringlib/undef.h"
4761
Antoine Pitrouab868312009-01-10 15:40:25 +00004762/* Mask to quickly check whether a C 'long' contains a
4763 non-ASCII, UTF8-encoded char. */
4764#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004765# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004766#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004767# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004768#else
4769# error C 'long' size should be either 4 or 8!
4770#endif
4771
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772static Py_ssize_t
4773ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004774{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004775 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004776 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004778 /*
4779 * Issue #17237: m68k is a bit different from most architectures in
4780 * that objects do not use "natural alignment" - for example, int and
4781 * long are only aligned at 2-byte boundaries. Therefore the assert()
4782 * won't work; also, tests have shown that skipping the "optimised
4783 * version" will even speed up m68k.
4784 */
4785#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004786#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004787 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4788 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004789 /* Fast path, see in STRINGLIB(utf8_decode) for
4790 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004791 /* Help allocation */
4792 const char *_p = p;
4793 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 while (_p < aligned_end) {
4795 unsigned long value = *(const unsigned long *) _p;
4796 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004797 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004798 *((unsigned long *)q) = value;
4799 _p += SIZEOF_LONG;
4800 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004801 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004802 p = _p;
4803 while (p < end) {
4804 if ((unsigned char)*p & 0x80)
4805 break;
4806 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004808 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004810#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004811#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004812 while (p < end) {
4813 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4814 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004815 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004816 /* Help allocation */
4817 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004818 while (_p < aligned_end) {
4819 unsigned long value = *(unsigned long *) _p;
4820 if (value & ASCII_CHAR_MASK)
4821 break;
4822 _p += SIZEOF_LONG;
4823 }
4824 p = _p;
4825 if (_p == end)
4826 break;
4827 }
4828 if ((unsigned char)*p & 0x80)
4829 break;
4830 ++p;
4831 }
4832 memcpy(dest, start, p - start);
4833 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834}
Antoine Pitrouab868312009-01-10 15:40:25 +00004835
Victor Stinner785938e2011-12-11 20:09:03 +01004836PyObject *
4837PyUnicode_DecodeUTF8Stateful(const char *s,
4838 Py_ssize_t size,
4839 const char *errors,
4840 Py_ssize_t *consumed)
4841{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004842 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004843 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004844 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004845
4846 Py_ssize_t startinpos;
4847 Py_ssize_t endinpos;
4848 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004849 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004850 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004851 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004852
4853 if (size == 0) {
4854 if (consumed)
4855 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004856 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004857 }
4858
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004859 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4860 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004861 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004862 *consumed = 1;
4863 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004864 }
4865
Victor Stinner8f674cc2013-04-17 23:02:17 +02004866 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004867 writer.min_length = size;
4868 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004869 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004870
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004871 writer.pos = ascii_decode(s, end, writer.data);
4872 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004873 while (s < end) {
4874 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004875 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004876
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004877 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004878 if (PyUnicode_IS_ASCII(writer.buffer))
4879 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004880 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004881 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004883 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884 } else {
4885 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004886 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 }
4888
4889 switch (ch) {
4890 case 0:
4891 if (s == end || consumed)
4892 goto End;
4893 errmsg = "unexpected end of data";
4894 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004895 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004896 break;
4897 case 1:
4898 errmsg = "invalid start byte";
4899 startinpos = s - starts;
4900 endinpos = startinpos + 1;
4901 break;
4902 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004903 case 3:
4904 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 errmsg = "invalid continuation byte";
4906 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004907 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004908 break;
4909 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004910 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 goto onError;
4912 continue;
4913 }
4914
Victor Stinner1d65d912015-10-05 13:43:50 +02004915 if (error_handler == _Py_ERROR_UNKNOWN)
4916 error_handler = get_error_handler(errors);
4917
4918 switch (error_handler) {
4919 case _Py_ERROR_IGNORE:
4920 s += (endinpos - startinpos);
4921 break;
4922
4923 case _Py_ERROR_REPLACE:
4924 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4925 goto onError;
4926 s += (endinpos - startinpos);
4927 break;
4928
4929 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004930 {
4931 Py_ssize_t i;
4932
Victor Stinner1d65d912015-10-05 13:43:50 +02004933 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4934 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004935 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004936 ch = (Py_UCS4)(unsigned char)(starts[i]);
4937 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4938 ch + 0xdc00);
4939 writer.pos++;
4940 }
4941 s += (endinpos - startinpos);
4942 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004943 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004944
4945 default:
4946 if (unicode_decode_call_errorhandler_writer(
4947 errors, &error_handler_obj,
4948 "utf-8", errmsg,
4949 &starts, &end, &startinpos, &endinpos, &exc, &s,
4950 &writer))
4951 goto onError;
4952 }
Victor Stinner785938e2011-12-11 20:09:03 +01004953 }
4954
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 if (consumed)
4957 *consumed = s - starts;
4958
Victor Stinner1d65d912015-10-05 13:43:50 +02004959 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004961 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004962
4963onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004964 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004965 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004966 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004967 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004968}
4969
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004970#ifdef __APPLE__
4971
4972/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004973 used to decode the command line arguments on Mac OS X.
4974
4975 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004976 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004977
4978wchar_t*
4979_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4980{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004981 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 wchar_t *unicode;
4983 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004984
4985 /* Note: size will always be longer than the resulting Unicode
4986 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01004987 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004988 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004989 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004990 if (!unicode)
4991 return NULL;
4992
4993 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004994 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004996 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004998#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005000#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005002#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 if (ch > 0xFF) {
5004#if SIZEOF_WCHAR_T == 4
5005 assert(0);
5006#else
5007 assert(Py_UNICODE_IS_SURROGATE(ch));
5008 /* compute and append the two surrogates: */
5009 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5010 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5011#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005012 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005013 else {
5014 if (!ch && s == e)
5015 break;
5016 /* surrogateescape */
5017 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5018 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005019 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005020 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005021 return unicode;
5022}
5023
5024#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005026/* Primary internal function which creates utf8 encoded bytes objects.
5027
5028 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005029 and allocate exactly as much space needed at the end. Else allocate the
5030 maximum possible needed (4 result bytes per Unicode character), and return
5031 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005032*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005033PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005034_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035{
Victor Stinner6099a032011-12-18 14:22:26 +01005036 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005037 void *data;
5038 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005040 if (!PyUnicode_Check(unicode)) {
5041 PyErr_BadArgument();
5042 return NULL;
5043 }
5044
5045 if (PyUnicode_READY(unicode) == -1)
5046 return NULL;
5047
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005048 if (PyUnicode_UTF8(unicode))
5049 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5050 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005051
5052 kind = PyUnicode_KIND(unicode);
5053 data = PyUnicode_DATA(unicode);
5054 size = PyUnicode_GET_LENGTH(unicode);
5055
Benjamin Petersonead6b532011-12-20 17:23:42 -06005056 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005057 default:
5058 assert(0);
5059 case PyUnicode_1BYTE_KIND:
5060 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5061 assert(!PyUnicode_IS_ASCII(unicode));
5062 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5063 case PyUnicode_2BYTE_KIND:
5064 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5065 case PyUnicode_4BYTE_KIND:
5066 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068}
5069
Alexander Belopolsky40018472011-02-26 01:02:56 +00005070PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005071PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5072 Py_ssize_t size,
5073 const char *errors)
5074{
5075 PyObject *v, *unicode;
5076
5077 unicode = PyUnicode_FromUnicode(s, size);
5078 if (unicode == NULL)
5079 return NULL;
5080 v = _PyUnicode_AsUTF8String(unicode, errors);
5081 Py_DECREF(unicode);
5082 return v;
5083}
5084
5085PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005086PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005088 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089}
5090
Walter Dörwald41980ca2007-08-16 21:55:45 +00005091/* --- UTF-32 Codec ------------------------------------------------------- */
5092
5093PyObject *
5094PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 Py_ssize_t size,
5096 const char *errors,
5097 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098{
5099 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5100}
5101
5102PyObject *
5103PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 Py_ssize_t size,
5105 const char *errors,
5106 int *byteorder,
5107 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005108{
5109 const char *starts = s;
5110 Py_ssize_t startinpos;
5111 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005112 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005113 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005114 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005115 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005116 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005117 PyObject *errorHandler = NULL;
5118 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005119
Walter Dörwald41980ca2007-08-16 21:55:45 +00005120 q = (unsigned char *)s;
5121 e = q + size;
5122
5123 if (byteorder)
5124 bo = *byteorder;
5125
5126 /* Check for BOM marks (U+FEFF) in the input and adjust current
5127 byte order setting accordingly. In native mode, the leading BOM
5128 mark is skipped, in all other modes, it is copied to the output
5129 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005130 if (bo == 0 && size >= 4) {
5131 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5132 if (bom == 0x0000FEFF) {
5133 bo = -1;
5134 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005135 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005136 else if (bom == 0xFFFE0000) {
5137 bo = 1;
5138 q += 4;
5139 }
5140 if (byteorder)
5141 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142 }
5143
Victor Stinnere64322e2012-10-30 23:12:47 +01005144 if (q == e) {
5145 if (consumed)
5146 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005147 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005148 }
5149
Victor Stinnere64322e2012-10-30 23:12:47 +01005150#ifdef WORDS_BIGENDIAN
5151 le = bo < 0;
5152#else
5153 le = bo <= 0;
5154#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005155 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005156
Victor Stinner8f674cc2013-04-17 23:02:17 +02005157 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005158 writer.min_length = (e - q + 3) / 4;
5159 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005160 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005161
Victor Stinnere64322e2012-10-30 23:12:47 +01005162 while (1) {
5163 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005164 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005165
Victor Stinnere64322e2012-10-30 23:12:47 +01005166 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005167 enum PyUnicode_Kind kind = writer.kind;
5168 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005169 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005170 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005171 if (le) {
5172 do {
5173 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5174 if (ch > maxch)
5175 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005176 if (kind != PyUnicode_1BYTE_KIND &&
5177 Py_UNICODE_IS_SURROGATE(ch))
5178 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005179 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005180 q += 4;
5181 } while (q <= last);
5182 }
5183 else {
5184 do {
5185 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5186 if (ch > maxch)
5187 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005188 if (kind != PyUnicode_1BYTE_KIND &&
5189 Py_UNICODE_IS_SURROGATE(ch))
5190 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005191 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005192 q += 4;
5193 } while (q <= last);
5194 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005195 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005196 }
5197
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005198 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005199 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005200 startinpos = ((const char *)q) - starts;
5201 endinpos = startinpos + 4;
5202 }
5203 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005204 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005206 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005208 startinpos = ((const char *)q) - starts;
5209 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005211 else {
5212 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005213 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005214 goto onError;
5215 q += 4;
5216 continue;
5217 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005218 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005219 startinpos = ((const char *)q) - starts;
5220 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005222
5223 /* The remaining input chars are ignored if the callback
5224 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005225 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005226 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005227 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005229 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005231 }
5232
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005234 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005235
Walter Dörwald41980ca2007-08-16 21:55:45 +00005236 Py_XDECREF(errorHandler);
5237 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005238 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005239
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005241 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005242 Py_XDECREF(errorHandler);
5243 Py_XDECREF(exc);
5244 return NULL;
5245}
5246
5247PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005248_PyUnicode_EncodeUTF32(PyObject *str,
5249 const char *errors,
5250 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005251{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005252 enum PyUnicode_Kind kind;
5253 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005254 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005255 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005256 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005257#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005258 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005259#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005260 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005261#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005262 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005263 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005264 PyObject *errorHandler = NULL;
5265 PyObject *exc = NULL;
5266 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005267
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005268 if (!PyUnicode_Check(str)) {
5269 PyErr_BadArgument();
5270 return NULL;
5271 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005272 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005273 return NULL;
5274 kind = PyUnicode_KIND(str);
5275 data = PyUnicode_DATA(str);
5276 len = PyUnicode_GET_LENGTH(str);
5277
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005278 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005279 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005280 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005281 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005282 if (v == NULL)
5283 return NULL;
5284
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005285 /* output buffer is 4-bytes aligned */
5286 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5287 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005288 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005289 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005290 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005291 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005292
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005293 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005294 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005295 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005296 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005297 else
5298 encoding = "utf-32";
5299
5300 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005301 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5302 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005303 }
5304
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005305 pos = 0;
5306 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005307 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005308
5309 if (kind == PyUnicode_2BYTE_KIND) {
5310 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5311 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005312 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005313 else {
5314 assert(kind == PyUnicode_4BYTE_KIND);
5315 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5316 &out, native_ordering);
5317 }
5318 if (pos == len)
5319 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005320
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005321 rep = unicode_encode_call_errorhandler(
5322 errors, &errorHandler,
5323 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005324 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005325 if (!rep)
5326 goto error;
5327
5328 if (PyBytes_Check(rep)) {
5329 repsize = PyBytes_GET_SIZE(rep);
5330 if (repsize & 3) {
5331 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005332 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005333 "surrogates not allowed");
5334 goto error;
5335 }
5336 moreunits = repsize / 4;
5337 }
5338 else {
5339 assert(PyUnicode_Check(rep));
5340 if (PyUnicode_READY(rep) < 0)
5341 goto error;
5342 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5343 if (!PyUnicode_IS_ASCII(rep)) {
5344 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005345 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005346 "surrogates not allowed");
5347 goto error;
5348 }
5349 }
5350
5351 /* four bytes are reserved for each surrogate */
5352 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005353 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005354 Py_ssize_t morebytes = 4 * (moreunits - 1);
5355 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5356 /* integer overflow */
5357 PyErr_NoMemory();
5358 goto error;
5359 }
5360 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5361 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005362 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005363 }
5364
5365 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005366 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5367 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005368 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005369 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005370 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5371 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005372 }
5373
5374 Py_CLEAR(rep);
5375 }
5376
5377 /* Cut back to size actually needed. This is necessary for, for example,
5378 encoding of a string containing isolated surrogates and the 'ignore'
5379 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005380 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005381 if (nsize != PyBytes_GET_SIZE(v))
5382 _PyBytes_Resize(&v, nsize);
5383 Py_XDECREF(errorHandler);
5384 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005385 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005386 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005387 error:
5388 Py_XDECREF(rep);
5389 Py_XDECREF(errorHandler);
5390 Py_XDECREF(exc);
5391 Py_XDECREF(v);
5392 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005393}
5394
Alexander Belopolsky40018472011-02-26 01:02:56 +00005395PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005396PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5397 Py_ssize_t size,
5398 const char *errors,
5399 int byteorder)
5400{
5401 PyObject *result;
5402 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5403 if (tmp == NULL)
5404 return NULL;
5405 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5406 Py_DECREF(tmp);
5407 return result;
5408}
5409
5410PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005411PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005412{
Victor Stinnerb960b342011-11-20 19:12:52 +01005413 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005414}
5415
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416/* --- UTF-16 Codec ------------------------------------------------------- */
5417
Tim Peters772747b2001-08-09 22:21:55 +00005418PyObject *
5419PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 Py_ssize_t size,
5421 const char *errors,
5422 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423{
Walter Dörwald69652032004-09-07 20:24:22 +00005424 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5425}
5426
5427PyObject *
5428PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 Py_ssize_t size,
5430 const char *errors,
5431 int *byteorder,
5432 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005433{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005435 Py_ssize_t startinpos;
5436 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005437 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005438 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005439 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005440 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005441 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005442 PyObject *errorHandler = NULL;
5443 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005444 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445
Tim Peters772747b2001-08-09 22:21:55 +00005446 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005447 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448
5449 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005450 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005452 /* Check for BOM marks (U+FEFF) in the input and adjust current
5453 byte order setting accordingly. In native mode, the leading BOM
5454 mark is skipped, in all other modes, it is copied to the output
5455 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005456 if (bo == 0 && size >= 2) {
5457 const Py_UCS4 bom = (q[1] << 8) | q[0];
5458 if (bom == 0xFEFF) {
5459 q += 2;
5460 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005461 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005462 else if (bom == 0xFFFE) {
5463 q += 2;
5464 bo = 1;
5465 }
5466 if (byteorder)
5467 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469
Antoine Pitrou63065d72012-05-15 23:48:04 +02005470 if (q == e) {
5471 if (consumed)
5472 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005473 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005474 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005475
Christian Heimes743e0cd2012-10-17 23:52:17 +02005476#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005477 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005478 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005479#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005480 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005481 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005482#endif
Tim Peters772747b2001-08-09 22:21:55 +00005483
Antoine Pitrou63065d72012-05-15 23:48:04 +02005484 /* Note: size will always be longer than the resulting Unicode
5485 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005486 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005487 writer.min_length = (e - q + 1) / 2;
5488 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005489 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005490
Antoine Pitrou63065d72012-05-15 23:48:04 +02005491 while (1) {
5492 Py_UCS4 ch = 0;
5493 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005494 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005495 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005496 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005497 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005498 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005499 native_ordering);
5500 else
5501 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005502 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005503 native_ordering);
5504 } else if (kind == PyUnicode_2BYTE_KIND) {
5505 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005506 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005507 native_ordering);
5508 } else {
5509 assert(kind == PyUnicode_4BYTE_KIND);
5510 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005511 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005512 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005513 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005514 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515
Antoine Pitrou63065d72012-05-15 23:48:04 +02005516 switch (ch)
5517 {
5518 case 0:
5519 /* remaining byte at the end? (size should be even) */
5520 if (q == e || consumed)
5521 goto End;
5522 errmsg = "truncated data";
5523 startinpos = ((const char *)q) - starts;
5524 endinpos = ((const char *)e) - starts;
5525 break;
5526 /* The remaining input chars are ignored if the callback
5527 chooses to skip the input */
5528 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005529 q -= 2;
5530 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005531 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005532 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005533 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005534 endinpos = ((const char *)e) - starts;
5535 break;
5536 case 2:
5537 errmsg = "illegal encoding";
5538 startinpos = ((const char *)q) - 2 - starts;
5539 endinpos = startinpos + 2;
5540 break;
5541 case 3:
5542 errmsg = "illegal UTF-16 surrogate";
5543 startinpos = ((const char *)q) - 4 - starts;
5544 endinpos = startinpos + 2;
5545 break;
5546 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005547 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005548 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005549 continue;
5550 }
5551
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005552 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005553 errors,
5554 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005555 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005556 &starts,
5557 (const char **)&e,
5558 &startinpos,
5559 &endinpos,
5560 &exc,
5561 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005562 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 }
5565
Antoine Pitrou63065d72012-05-15 23:48:04 +02005566End:
Walter Dörwald69652032004-09-07 20:24:22 +00005567 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 Py_XDECREF(errorHandler);
5571 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005572 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005575 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576 Py_XDECREF(errorHandler);
5577 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 return NULL;
5579}
5580
Tim Peters772747b2001-08-09 22:21:55 +00005581PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005582_PyUnicode_EncodeUTF16(PyObject *str,
5583 const char *errors,
5584 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005586 enum PyUnicode_Kind kind;
5587 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005588 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005589 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005590 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005591 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005592#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005593 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005594#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005595 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005596#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005597 const char *encoding;
5598 Py_ssize_t nsize, pos;
5599 PyObject *errorHandler = NULL;
5600 PyObject *exc = NULL;
5601 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005602
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005603 if (!PyUnicode_Check(str)) {
5604 PyErr_BadArgument();
5605 return NULL;
5606 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005607 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005608 return NULL;
5609 kind = PyUnicode_KIND(str);
5610 data = PyUnicode_DATA(str);
5611 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005612
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005613 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005614 if (kind == PyUnicode_4BYTE_KIND) {
5615 const Py_UCS4 *in = (const Py_UCS4 *)data;
5616 const Py_UCS4 *end = in + len;
5617 while (in < end)
5618 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005619 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005620 }
5621 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005623 nsize = len + pairs + (byteorder == 0);
5624 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 if (v == NULL)
5626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005628 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005629 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005630 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005632 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005633 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005634 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005635
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005636 if (kind == PyUnicode_1BYTE_KIND) {
5637 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5638 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005639 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005640
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005641 if (byteorder < 0)
5642 encoding = "utf-16-le";
5643 else if (byteorder > 0)
5644 encoding = "utf-16-be";
5645 else
5646 encoding = "utf-16";
5647
5648 pos = 0;
5649 while (pos < len) {
5650 Py_ssize_t repsize, moreunits;
5651
5652 if (kind == PyUnicode_2BYTE_KIND) {
5653 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5654 &out, native_ordering);
5655 }
5656 else {
5657 assert(kind == PyUnicode_4BYTE_KIND);
5658 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5659 &out, native_ordering);
5660 }
5661 if (pos == len)
5662 break;
5663
5664 rep = unicode_encode_call_errorhandler(
5665 errors, &errorHandler,
5666 encoding, "surrogates not allowed",
5667 str, &exc, pos, pos + 1, &pos);
5668 if (!rep)
5669 goto error;
5670
5671 if (PyBytes_Check(rep)) {
5672 repsize = PyBytes_GET_SIZE(rep);
5673 if (repsize & 1) {
5674 raise_encode_exception(&exc, encoding,
5675 str, pos - 1, pos,
5676 "surrogates not allowed");
5677 goto error;
5678 }
5679 moreunits = repsize / 2;
5680 }
5681 else {
5682 assert(PyUnicode_Check(rep));
5683 if (PyUnicode_READY(rep) < 0)
5684 goto error;
5685 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5686 if (!PyUnicode_IS_ASCII(rep)) {
5687 raise_encode_exception(&exc, encoding,
5688 str, pos - 1, pos,
5689 "surrogates not allowed");
5690 goto error;
5691 }
5692 }
5693
5694 /* two bytes are reserved for each surrogate */
5695 if (moreunits > 1) {
5696 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5697 Py_ssize_t morebytes = 2 * (moreunits - 1);
5698 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5699 /* integer overflow */
5700 PyErr_NoMemory();
5701 goto error;
5702 }
5703 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5704 goto error;
5705 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5706 }
5707
5708 if (PyBytes_Check(rep)) {
5709 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5710 out += moreunits;
5711 } else /* rep is unicode */ {
5712 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5713 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5714 &out, native_ordering);
5715 }
5716
5717 Py_CLEAR(rep);
5718 }
5719
5720 /* Cut back to size actually needed. This is necessary for, for example,
5721 encoding of a string containing isolated surrogates and the 'ignore' handler
5722 is used. */
5723 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5724 if (nsize != PyBytes_GET_SIZE(v))
5725 _PyBytes_Resize(&v, nsize);
5726 Py_XDECREF(errorHandler);
5727 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005728 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005729 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005730 error:
5731 Py_XDECREF(rep);
5732 Py_XDECREF(errorHandler);
5733 Py_XDECREF(exc);
5734 Py_XDECREF(v);
5735 return NULL;
5736#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737}
5738
Alexander Belopolsky40018472011-02-26 01:02:56 +00005739PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005740PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5741 Py_ssize_t size,
5742 const char *errors,
5743 int byteorder)
5744{
5745 PyObject *result;
5746 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5747 if (tmp == NULL)
5748 return NULL;
5749 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5750 Py_DECREF(tmp);
5751 return result;
5752}
5753
5754PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005755PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005757 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758}
5759
5760/* --- Unicode Escape Codec ----------------------------------------------- */
5761
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether the
   escaped string decodes to a pure ASCII string and, if so, how long the
   decoded string is.

   Returns the number of characters the decoded string will have, or -1 if
   that cannot be guaranteed to be an ASCII string:
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape is present.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming the end pointer: p + size with a
       negative size would be undefined pointer arithmetic. */
    if (size < 0)
        return -1;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* two input bytes collapse into one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5816
Fredrik Lundh06d12682001-01-24 07:59:11 +00005817static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005818
Alexander Belopolsky40018472011-02-26 01:02:56 +00005819PyObject *
5820PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005821 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005822 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005825 Py_ssize_t startinpos;
5826 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005827 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005829 char* message;
5830 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005831 PyObject *errorHandler = NULL;
5832 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005833 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005834
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005835 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005836 if (len == 0)
5837 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005838
5839 /* After length_of_escaped_ascii_string() there are two alternatives,
5840 either the string is pure ASCII with named escapes like \n, etc.
5841 and we determined it's exact size (common case)
5842 or it contains \x, \u, ... escape sequences. then we create a
5843 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005844 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005845 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005846 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005847 }
5848 else {
5849 /* Escaped strings will always be longer than the resulting
5850 Unicode string, so we start with size here and then reduce the
5851 length after conversion to the true value.
5852 (but if the error callback returns a long replacement string
5853 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005854 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005855 }
5856
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005858 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005860
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 while (s < end) {
5862 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005863 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005864 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865
5866 /* Non-escape characters are interpreted as Unicode ordinals */
5867 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005868 x = (unsigned char)*s;
5869 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005870 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005871 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 continue;
5873 }
5874
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005875 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 /* \ - Escapes */
5877 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005878 c = *s++;
5879 if (s > end)
5880 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005881
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005882 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005885#define WRITECHAR(ch) \
5886 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005887 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005888 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005889 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005890
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005892 case '\\': WRITECHAR('\\'); break;
5893 case '\'': WRITECHAR('\''); break;
5894 case '\"': WRITECHAR('\"'); break;
5895 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005896 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005897 case 'f': WRITECHAR('\014'); break;
5898 case 't': WRITECHAR('\t'); break;
5899 case 'n': WRITECHAR('\n'); break;
5900 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005901 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005902 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005903 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005904 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 case '0': case '1': case '2': case '3':
5908 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005909 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005910 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005911 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005912 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005913 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005915 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 break;
5917
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 /* hex escapes */
5919 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005921 digits = 2;
5922 message = "truncated \\xXX escape";
5923 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005927 digits = 4;
5928 message = "truncated \\uXXXX escape";
5929 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005932 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005933 digits = 8;
5934 message = "truncated \\UXXXXXXXX escape";
5935 hexescape:
5936 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005937 if (end - s < digits) {
5938 /* count only hex digits */
5939 for (; s < end; ++s) {
5940 c = (unsigned char)*s;
5941 if (!Py_ISXDIGIT(c))
5942 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005943 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005944 goto error;
5945 }
5946 for (; digits--; ++s) {
5947 c = (unsigned char)*s;
5948 if (!Py_ISXDIGIT(c))
5949 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005950 chr = (chr<<4) & ~0xF;
5951 if (c >= '0' && c <= '9')
5952 chr += c - '0';
5953 else if (c >= 'a' && c <= 'f')
5954 chr += 10 + c - 'a';
5955 else
5956 chr += 10 + c - 'A';
5957 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005958 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005959 /* _decoding_error will have already written into the
5960 target buffer. */
5961 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005962 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005963 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005964 message = "illegal Unicode character";
5965 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005966 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005967 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005968 break;
5969
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005971 case 'N':
5972 message = "malformed \\N character escape";
5973 if (ucnhash_CAPI == NULL) {
5974 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005975 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5976 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005977 if (ucnhash_CAPI == NULL)
5978 goto ucnhashError;
5979 }
5980 if (*s == '{') {
5981 const char *start = s+1;
5982 /* look for the closing brace */
5983 while (*s != '}' && s < end)
5984 s++;
5985 if (s > start && s < end && *s == '}') {
5986 /* found a name. look it up in the unicode database */
5987 message = "unknown Unicode character name";
5988 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005989 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005990 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005991 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005992 goto store;
5993 }
5994 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005995 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005996
5997 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005998 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005999 message = "\\ at end of string";
6000 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006001 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00006002 }
6003 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006004 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02006005 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006006 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006007 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006009 continue;
6010
6011 error:
6012 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006013 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006014 errors, &errorHandler,
6015 "unicodeescape", message,
6016 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006017 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02006018 goto onError;
6019 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006021#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006022
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006023 Py_XDECREF(errorHandler);
6024 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006025 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006026
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006028 PyErr_SetString(
6029 PyExc_UnicodeError,
6030 "\\N escapes not supported (can't load unicodedata module)"
6031 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006032 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 Py_XDECREF(errorHandler);
6034 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006035 return NULL;
6036
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006038 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006039 Py_XDECREF(errorHandler);
6040 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 return NULL;
6042}
6043
6044/* Return a Unicode-Escape string version of the Unicode object.
6045
6046 If quotes is true, the string is enclosed in u"" or u'' quotes as
6047 appropriate.
6048
6049*/
6050
Alexander Belopolsky40018472011-02-26 01:02:56 +00006051PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006052PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006054 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006055 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006057 int kind;
6058 void *data;
6059 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060
Ezio Melottie7f90372012-10-05 03:33:31 +03006061 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006062 escape.
6063
Ezio Melottie7f90372012-10-05 03:33:31 +03006064 For UCS1 strings it's '\xxx', 4 bytes per source character.
6065 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6066 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006067 */
6068
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006069 if (!PyUnicode_Check(unicode)) {
6070 PyErr_BadArgument();
6071 return NULL;
6072 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006073 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006074 return NULL;
6075 len = PyUnicode_GET_LENGTH(unicode);
6076 kind = PyUnicode_KIND(unicode);
6077 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006078 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006079 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6080 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6081 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6082 }
6083
6084 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006085 return PyBytes_FromStringAndSize(NULL, 0);
6086
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006087 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006089
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006090 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006092 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 if (repr == NULL)
6095 return NULL;
6096
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006097 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006099 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006100 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006101
Walter Dörwald79e913e2007-05-12 11:08:06 +00006102 /* Escape backslashes */
6103 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 *p++ = '\\';
6105 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006106 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006107 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006108
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006109 /* Map 21-bit characters to '\U00xxxxxx' */
6110 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006111 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006112 *p++ = '\\';
6113 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006114 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6115 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6116 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6117 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6118 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6119 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6120 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6121 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006123 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006124
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006126 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 *p++ = '\\';
6128 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006129 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6130 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6131 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6132 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006134
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006135 /* Map special whitespace to '\t', \n', '\r' */
6136 else if (ch == '\t') {
6137 *p++ = '\\';
6138 *p++ = 't';
6139 }
6140 else if (ch == '\n') {
6141 *p++ = '\\';
6142 *p++ = 'n';
6143 }
6144 else if (ch == '\r') {
6145 *p++ = '\\';
6146 *p++ = 'r';
6147 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006148
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006149 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006150 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006152 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006153 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6154 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006155 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006156
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 /* Copy everything else as-is */
6158 else
6159 *p++ = (char) ch;
6160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006162 assert(p - PyBytes_AS_STRING(repr) > 0);
6163 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6164 return NULL;
6165 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166}
6167
Alexander Belopolsky40018472011-02-26 01:02:56 +00006168PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6170 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 PyObject *result;
6173 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6174 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006176 result = PyUnicode_AsUnicodeEscapeString(tmp);
6177 Py_DECREF(tmp);
6178 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179}
6180
6181/* --- Raw Unicode Escape Codec ------------------------------------------- */
6182
Alexander Belopolsky40018472011-02-26 01:02:56 +00006183PyObject *
6184PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006185 Py_ssize_t size,
6186 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006188 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006189 Py_ssize_t startinpos;
6190 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006191 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 const char *end;
6193 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006194 PyObject *errorHandler = NULL;
6195 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006196
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006197 if (size == 0)
6198 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006199
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 /* Escaped strings will always be longer than the resulting
6201 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006202 length after conversion to the true value. (But decoding error
6203 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006204 _PyUnicodeWriter_Init(&writer);
6205 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006206
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 end = s + size;
6208 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 unsigned char c;
6210 Py_UCS4 x;
6211 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006212 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 /* Non-escape characters are interpreted as Unicode ordinals */
6215 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006216 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006217 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006218 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006220 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 startinpos = s-starts;
6222
6223 /* \u-escapes are only interpreted iff the number of leading
6224 backslashes if odd */
6225 bs = s;
6226 for (;s < end;) {
6227 if (*s != '\\')
6228 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006229 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006230 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006231 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006232 }
6233 if (((s - bs) & 1) == 0 ||
6234 s >= end ||
6235 (*s != 'u' && *s != 'U')) {
6236 continue;
6237 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006238 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 count = *s=='u' ? 4 : 8;
6240 s++;
6241
6242 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 for (x = 0, i = 0; i < count; ++i, ++s) {
6244 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006245 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006247 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 errors, &errorHandler,
6249 "rawunicodeescape", "truncated \\uXXXX",
6250 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006251 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 goto onError;
6253 goto nextByte;
6254 }
6255 x = (x<<4) & ~0xF;
6256 if (c >= '0' && c <= '9')
6257 x += c - '0';
6258 else if (c >= 'a' && c <= 'f')
6259 x += 10 + c - 'a';
6260 else
6261 x += 10 + c - 'A';
6262 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006263 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006264 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006265 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006266 }
6267 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006268 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006269 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006270 errors, &errorHandler,
6271 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006273 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006275 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 nextByte:
6277 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 Py_XDECREF(errorHandler);
6280 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006281 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006282
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006284 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006285 Py_XDECREF(errorHandler);
6286 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 return NULL;
6288}
6289
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006290
Alexander Belopolsky40018472011-02-26 01:02:56 +00006291PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006292PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006294 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 char *p;
6296 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006297 Py_ssize_t expandsize, pos;
6298 int kind;
6299 void *data;
6300 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006302 if (!PyUnicode_Check(unicode)) {
6303 PyErr_BadArgument();
6304 return NULL;
6305 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006306 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006307 return NULL;
6308 kind = PyUnicode_KIND(unicode);
6309 data = PyUnicode_DATA(unicode);
6310 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006311 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6312 bytes, and 1 byte characters 4. */
6313 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006314
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006315 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006317
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006318 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 if (repr == NULL)
6320 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006321 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006322 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006324 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006325 for (pos = 0; pos < len; pos++) {
6326 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 /* Map 32-bit characters to '\Uxxxxxxxx' */
6328 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006329 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006330 *p++ = '\\';
6331 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006332 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6333 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6334 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6335 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6336 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6337 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6338 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6339 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006340 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006342 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 *p++ = '\\';
6344 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006345 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6346 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6347 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6348 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 /* Copy everything else as-is */
6351 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 *p++ = (char) ch;
6353 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006354
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006355 assert(p > q);
6356 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006357 return NULL;
6358 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359}
6360
Alexander Belopolsky40018472011-02-26 01:02:56 +00006361PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006362PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6363 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006365 PyObject *result;
6366 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6367 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006368 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006369 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6370 Py_DECREF(tmp);
6371 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372}
6373
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006374/* --- Unicode Internal Codec ------------------------------------------- */
6375
Alexander Belopolsky40018472011-02-26 01:02:56 +00006376PyObject *
6377_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006378 Py_ssize_t size,
6379 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006380{
6381 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006382 Py_ssize_t startinpos;
6383 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006384 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006385 const char *end;
6386 const char *reason;
6387 PyObject *errorHandler = NULL;
6388 PyObject *exc = NULL;
6389
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006390 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006391 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006392 1))
6393 return NULL;
6394
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006395 if (size == 0)
6396 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006397
Victor Stinner8f674cc2013-04-17 23:02:17 +02006398 _PyUnicodeWriter_Init(&writer);
6399 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6400 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006402 }
6403 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006404
Victor Stinner8f674cc2013-04-17 23:02:17 +02006405 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006406 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006407 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006408 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006409 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006410 endinpos = end-starts;
6411 reason = "truncated input";
6412 goto error;
6413 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006414 /* We copy the raw representation one byte at a time because the
6415 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006416 ((char *) &uch)[0] = s[0];
6417 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006418#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006419 ((char *) &uch)[2] = s[2];
6420 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006421#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006422 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006423#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006424 /* We have to sanity check the raw data, otherwise doom looms for
6425 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006426 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006427 endinpos = s - starts + Py_UNICODE_SIZE;
6428 reason = "illegal code point (> 0x10FFFF)";
6429 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006430 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006431#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006432 s += Py_UNICODE_SIZE;
6433#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006434 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006435 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006436 Py_UNICODE uch2;
6437 ((char *) &uch2)[0] = s[0];
6438 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006439 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006440 {
Victor Stinner551ac952011-11-29 22:58:13 +01006441 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006442 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006443 }
6444 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006445#endif
6446
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006447 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006448 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006449 continue;
6450
6451 error:
6452 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006453 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006454 errors, &errorHandler,
6455 "unicode_internal", reason,
6456 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006457 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006458 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006459 }
6460
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006461 Py_XDECREF(errorHandler);
6462 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006463 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006464
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006466 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006467 Py_XDECREF(errorHandler);
6468 Py_XDECREF(exc);
6469 return NULL;
6470}
6471
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472/* --- Latin-1 Codec ------------------------------------------------------ */
6473
Alexander Belopolsky40018472011-02-26 01:02:56 +00006474PyObject *
6475PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006476 Py_ssize_t size,
6477 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006480 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481}
6482
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006484static void
6485make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006486 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006487 PyObject *unicode,
6488 Py_ssize_t startpos, Py_ssize_t endpos,
6489 const char *reason)
6490{
6491 if (*exceptionObject == NULL) {
6492 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006494 encoding, unicode, startpos, endpos, reason);
6495 }
6496 else {
6497 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6498 goto onError;
6499 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6500 goto onError;
6501 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6502 goto onError;
6503 return;
6504 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006505 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006506 }
6507}
6508
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006509/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006510static void
6511raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006512 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006513 PyObject *unicode,
6514 Py_ssize_t startpos, Py_ssize_t endpos,
6515 const char *reason)
6516{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006517 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006518 encoding, unicode, startpos, endpos, reason);
6519 if (*exceptionObject != NULL)
6520 PyCodec_StrictErrors(*exceptionObject);
6521}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006522
6523/* error handling callback helper:
6524 build arguments, call the callback and check the arguments,
6525 put the result into newpos and return the replacement string, which
6526 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006527static PyObject *
6528unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006529 PyObject **errorHandler,
6530 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006531 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006532 Py_ssize_t startpos, Py_ssize_t endpos,
6533 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006534{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006535 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006536 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006537 PyObject *restuple;
6538 PyObject *resunicode;
6539
6540 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006542 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006544 }
6545
Benjamin Petersonbac79492012-01-14 13:34:47 -05006546 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006547 return NULL;
6548 len = PyUnicode_GET_LENGTH(unicode);
6549
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006550 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006551 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006552 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006554
6555 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006557 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006559 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006560 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 Py_DECREF(restuple);
6562 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006563 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006564 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 &resunicode, newpos)) {
6566 Py_DECREF(restuple);
6567 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006568 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006569 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6570 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6571 Py_DECREF(restuple);
6572 return NULL;
6573 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006574 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006575 *newpos = len + *newpos;
6576 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006577 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 Py_DECREF(restuple);
6579 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006580 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006581 Py_INCREF(resunicode);
6582 Py_DECREF(restuple);
6583 return resunicode;
6584}
6585
Alexander Belopolsky40018472011-02-26 01:02:56 +00006586static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006587unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006588 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006589 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006590{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006591 /* input state */
6592 Py_ssize_t pos=0, size;
6593 int kind;
6594 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006595 /* pointer into the output */
6596 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006597 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6598 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006599 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006600 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006601 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006602 /* output object */
6603 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006604
Benjamin Petersonbac79492012-01-14 13:34:47 -05006605 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006606 return NULL;
6607 size = PyUnicode_GET_LENGTH(unicode);
6608 kind = PyUnicode_KIND(unicode);
6609 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006610 /* allocate enough for a simple encoding without
6611 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006612 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006613 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006614
6615 _PyBytesWriter_Init(&writer);
6616 str = _PyBytesWriter_Alloc(&writer, size);
6617 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006618 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006619
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006621 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006624 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006626 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006627 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006628 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 PyObject *repunicode;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006631 Py_ssize_t repsize, newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006632 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006633 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006634 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006635 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006636
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006637 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006639
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006640 /* Only overallocate the buffer if it's not the last write */
6641 writer.overallocate = (collend < size);
6642
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006644 if (error_handler == _Py_ERROR_UNKNOWN)
6645 error_handler = get_error_handler(errors);
6646
6647 switch (error_handler) {
6648 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006649 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006651
6652 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006653 memset(str, '?', collend - collstart);
6654 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006655 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006656 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006657 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 break;
Victor Stinner50149202015-09-22 00:26:54 +02006659
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006660 case _Py_ERROR_BACKSLASHREPLACE:
Victor Stinnerad771582015-10-09 12:38:53 +02006661 /* substract preallocated bytes */
6662 writer.min_size -= (collend - collstart);
6663 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006664 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006665 if (str == NULL)
6666 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006667 pos = collend;
6668 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006669
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006670 case _Py_ERROR_XMLCHARREFREPLACE:
Victor Stinnerad771582015-10-09 12:38:53 +02006671 /* substract preallocated bytes */
6672 writer.min_size -= (collend - collstart);
6673 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006674 unicode, collstart, collend);
6675 if (str == NULL)
6676 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006677 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 break;
Victor Stinner50149202015-09-22 00:26:54 +02006679
Victor Stinnerc3713e92015-09-29 12:32:13 +02006680 case _Py_ERROR_SURROGATEESCAPE:
6681 for (i = collstart; i < collend; ++i) {
6682 ch = PyUnicode_READ(kind, data, i);
6683 if (ch < 0xdc80 || 0xdcff < ch) {
6684 /* Not a UTF-8b surrogate */
6685 break;
6686 }
6687 *str++ = (char)(ch - 0xdc00);
6688 ++pos;
6689 }
6690 if (i >= collend)
6691 break;
6692 collstart = pos;
6693 assert(collstart != collend);
6694 /* fallback to general error handling */
6695
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 default:
Victor Stinner50149202015-09-22 00:26:54 +02006697 repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006698 encoding, reason, unicode, &exc,
6699 collstart, collend, &newpos);
6700 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006701 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006702 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006703
Victor Stinnerad771582015-10-09 12:38:53 +02006704 /* substract preallocated bytes */
6705 writer.min_size -= 1;
6706
Martin v. Löwis011e8422009-05-05 04:43:17 +00006707 if (PyBytes_Check(repunicode)) {
6708 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006709 str = _PyBytesWriter_WriteBytes(&writer, str,
6710 PyBytes_AS_STRING(repunicode),
6711 PyBytes_GET_SIZE(repunicode));
Victor Stinnerad771582015-10-09 12:38:53 +02006712 if (str == NULL)
6713 goto onError;
6714
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006715 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006716 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006717 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006718 }
Victor Stinner0030cd52015-09-24 14:45:00 +02006719
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 /* need more space? (at least enough for what we
6721 have+the replacement+the rest of the string, so
6722 we won't have to check space for encodable characters) */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006723 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerad771582015-10-09 12:38:53 +02006724
6725 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6726 if (str == NULL)
6727 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006728
Benjamin Peterson29060642009-01-31 22:14:21 +00006729 /* check if there is anything unencodable in the replacement
6730 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006731 for (i = 0; repsize-->0; ++i, ++str) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006732 ch = PyUnicode_READ_CHAR(repunicode, i);
6733 if (ch >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006734 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 Py_DECREF(repunicode);
6737 goto onError;
6738 }
Victor Stinner0030cd52015-09-24 14:45:00 +02006739 *str = (char)ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006741 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006742 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006743 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006744
6745 /* If overallocation was disabled, ensure that it was the last
6746 write. Otherwise, we missed an optimization */
6747 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006748 }
6749 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006750
Victor Stinner50149202015-09-22 00:26:54 +02006751 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006752 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006753 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006754
6755 onError:
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006756 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006757 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006758 Py_XDECREF(exc);
6759 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006760}
6761
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006762/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006763PyObject *
6764PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006765 Py_ssize_t size,
6766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006768 PyObject *result;
6769 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6770 if (unicode == NULL)
6771 return NULL;
6772 result = unicode_encode_ucs1(unicode, errors, 256);
6773 Py_DECREF(unicode);
6774 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775}
6776
Alexander Belopolsky40018472011-02-26 01:02:56 +00006777PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006778_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779{
6780 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 PyErr_BadArgument();
6782 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006784 if (PyUnicode_READY(unicode) == -1)
6785 return NULL;
6786 /* Fast path: if it is a one-byte string, construct
6787 bytes object directly. */
6788 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6789 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6790 PyUnicode_GET_LENGTH(unicode));
6791 /* Non-Latin-1 characters present. Defer to above function to
6792 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006793 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006794}
6795
6796PyObject*
6797PyUnicode_AsLatin1String(PyObject *unicode)
6798{
6799 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800}
6801
6802/* --- 7-bit ASCII Codec -------------------------------------------------- */
6803
Alexander Belopolsky40018472011-02-26 01:02:56 +00006804PyObject *
6805PyUnicode_DecodeASCII(const char *s,
6806 Py_ssize_t size,
6807 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006809 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006810 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006811 int kind;
6812 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006813 Py_ssize_t startinpos;
6814 Py_ssize_t endinpos;
6815 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006816 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006817 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006818 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006819 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006820
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006822 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006823
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006825 if (size == 1 && (unsigned char)s[0] < 128)
6826 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006827
Victor Stinner8f674cc2013-04-17 23:02:17 +02006828 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006829 writer.min_length = size;
6830 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006831 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006833 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006834 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006835 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006836 writer.pos = outpos;
6837 if (writer.pos == size)
6838 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006839
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006840 s += writer.pos;
6841 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006842 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006843 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006845 PyUnicode_WRITE(kind, data, writer.pos, c);
6846 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006848 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006850
6851 /* byte outsize range 0x00..0x7f: call the error handler */
6852
6853 if (error_handler == _Py_ERROR_UNKNOWN)
6854 error_handler = get_error_handler(errors);
6855
6856 switch (error_handler)
6857 {
6858 case _Py_ERROR_REPLACE:
6859 case _Py_ERROR_SURROGATEESCAPE:
6860 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006861 but we may switch to UCS2 at the first write */
6862 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6863 goto onError;
6864 kind = writer.kind;
6865 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006866
6867 if (error_handler == _Py_ERROR_REPLACE)
6868 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6869 else
6870 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6871 writer.pos++;
6872 ++s;
6873 break;
6874
6875 case _Py_ERROR_IGNORE:
6876 ++s;
6877 break;
6878
6879 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 startinpos = s-starts;
6881 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006882 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006883 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 "ascii", "ordinal not in range(128)",
6885 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006886 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006888 kind = writer.kind;
6889 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006892 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006893 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006894 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006895
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006897 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006898 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900 return NULL;
6901}
6902
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006903/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006904PyObject *
6905PyUnicode_EncodeASCII(const Py_UNICODE *p,
6906 Py_ssize_t size,
6907 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006909 PyObject *result;
6910 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6911 if (unicode == NULL)
6912 return NULL;
6913 result = unicode_encode_ucs1(unicode, errors, 128);
6914 Py_DECREF(unicode);
6915 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916}
6917
Alexander Belopolsky40018472011-02-26 01:02:56 +00006918PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006919_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920{
6921 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 PyErr_BadArgument();
6923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006925 if (PyUnicode_READY(unicode) == -1)
6926 return NULL;
6927 /* Fast path: if it is an ASCII-only string, construct bytes object
6928 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006929 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006930 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6931 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006932 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006933}
6934
6935PyObject *
6936PyUnicode_AsASCIIString(PyObject *unicode)
6937{
6938 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939}
6940
Victor Stinner99b95382011-07-04 14:23:54 +02006941#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006942
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006943/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006944
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006945#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006946#define NEED_RETRY
6947#endif
6948
Victor Stinner3a50e702011-10-18 21:21:00 +02006949#ifndef WC_ERR_INVALID_CHARS
6950# define WC_ERR_INVALID_CHARS 0x0080
6951#endif
6952
6953static char*
6954code_page_name(UINT code_page, PyObject **obj)
6955{
6956 *obj = NULL;
6957 if (code_page == CP_ACP)
6958 return "mbcs";
6959 if (code_page == CP_UTF7)
6960 return "CP_UTF7";
6961 if (code_page == CP_UTF8)
6962 return "CP_UTF8";
6963
6964 *obj = PyBytes_FromFormat("cp%u", code_page);
6965 if (*obj == NULL)
6966 return NULL;
6967 return PyBytes_AS_STRING(*obj);
6968}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006969
Victor Stinner3a50e702011-10-18 21:21:00 +02006970static DWORD
6971decode_code_page_flags(UINT code_page)
6972{
6973 if (code_page == CP_UTF7) {
6974 /* The CP_UTF7 decoder only supports flags=0 */
6975 return 0;
6976 }
6977 else
6978 return MB_ERR_INVALID_CHARS;
6979}
6980
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006981/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006982 * Decode a byte string from a Windows code page into unicode object in strict
6983 * mode.
6984 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006985 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6986 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006987 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006988static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006989decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006990 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006991 const char *in,
6992 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006993{
Victor Stinner3a50e702011-10-18 21:21:00 +02006994 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006995 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006996 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006997
6998 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006999 assert(insize > 0);
7000 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7001 if (outsize <= 0)
7002 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007003
7004 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007006 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007007 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 if (*v == NULL)
7009 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007010 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007011 }
7012 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007014 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007015 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007017 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007018 }
7019
7020 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007021 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7022 if (outsize <= 0)
7023 goto error;
7024 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007025
Victor Stinner3a50e702011-10-18 21:21:00 +02007026error:
7027 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7028 return -2;
7029 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007030 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007031}
7032
Victor Stinner3a50e702011-10-18 21:21:00 +02007033/*
7034 * Decode a byte string from a code page into unicode object with an error
7035 * handler.
7036 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007037 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007038 * UnicodeDecodeError exception and returns -1 on error.
7039 */
7040static int
7041decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007042 PyObject **v,
7043 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007044 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007045{
7046 const char *startin = in;
7047 const char *endin = in + size;
7048 const DWORD flags = decode_code_page_flags(code_page);
7049 /* Ideally, we should get reason from FormatMessage. This is the Windows
7050 2000 English version of the message. */
7051 const char *reason = "No mapping for the Unicode character exists "
7052 "in the target code page.";
7053 /* each step cannot decode more than 1 character, but a character can be
7054 represented as a surrogate pair */
7055 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007056 int insize;
7057 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007058 PyObject *errorHandler = NULL;
7059 PyObject *exc = NULL;
7060 PyObject *encoding_obj = NULL;
7061 char *encoding;
7062 DWORD err;
7063 int ret = -1;
7064
7065 assert(size > 0);
7066
7067 encoding = code_page_name(code_page, &encoding_obj);
7068 if (encoding == NULL)
7069 return -1;
7070
Victor Stinner7d00cc12014-03-17 23:08:06 +01007071 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007072 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7073 UnicodeDecodeError. */
7074 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7075 if (exc != NULL) {
7076 PyCodec_StrictErrors(exc);
7077 Py_CLEAR(exc);
7078 }
7079 goto error;
7080 }
7081
7082 if (*v == NULL) {
7083 /* Create unicode object */
7084 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7085 PyErr_NoMemory();
7086 goto error;
7087 }
Victor Stinnerab595942011-12-17 04:59:06 +01007088 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007089 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007090 if (*v == NULL)
7091 goto error;
7092 startout = PyUnicode_AS_UNICODE(*v);
7093 }
7094 else {
7095 /* Extend unicode object */
7096 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7097 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7098 PyErr_NoMemory();
7099 goto error;
7100 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007101 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007102 goto error;
7103 startout = PyUnicode_AS_UNICODE(*v) + n;
7104 }
7105
7106 /* Decode the byte string character per character */
7107 out = startout;
7108 while (in < endin)
7109 {
7110 /* Decode a character */
7111 insize = 1;
7112 do
7113 {
7114 outsize = MultiByteToWideChar(code_page, flags,
7115 in, insize,
7116 buffer, Py_ARRAY_LENGTH(buffer));
7117 if (outsize > 0)
7118 break;
7119 err = GetLastError();
7120 if (err != ERROR_NO_UNICODE_TRANSLATION
7121 && err != ERROR_INSUFFICIENT_BUFFER)
7122 {
7123 PyErr_SetFromWindowsErr(0);
7124 goto error;
7125 }
7126 insize++;
7127 }
7128 /* 4=maximum length of a UTF-8 sequence */
7129 while (insize <= 4 && (in + insize) <= endin);
7130
7131 if (outsize <= 0) {
7132 Py_ssize_t startinpos, endinpos, outpos;
7133
Victor Stinner7d00cc12014-03-17 23:08:06 +01007134 /* last character in partial decode? */
7135 if (in + insize >= endin && !final)
7136 break;
7137
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 startinpos = in - startin;
7139 endinpos = startinpos + 1;
7140 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007141 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 errors, &errorHandler,
7143 encoding, reason,
7144 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007145 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 {
7147 goto error;
7148 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007149 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 }
7151 else {
7152 in += insize;
7153 memcpy(out, buffer, outsize * sizeof(wchar_t));
7154 out += outsize;
7155 }
7156 }
7157
7158 /* write a NUL character at the end */
7159 *out = 0;
7160
7161 /* Extend unicode object */
7162 outsize = out - startout;
7163 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007164 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007166 /* (in - startin) <= size and size is an int */
7167 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007168
7169error:
7170 Py_XDECREF(encoding_obj);
7171 Py_XDECREF(errorHandler);
7172 Py_XDECREF(exc);
7173 return ret;
7174}
7175
Victor Stinner3a50e702011-10-18 21:21:00 +02007176static PyObject *
7177decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007178 const char *s, Py_ssize_t size,
7179 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007180{
Victor Stinner76a31a62011-11-04 00:05:13 +01007181 PyObject *v = NULL;
7182 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007183
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 if (code_page < 0) {
7185 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7186 return NULL;
7187 }
7188
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007189 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007191
Victor Stinner76a31a62011-11-04 00:05:13 +01007192 do
7193 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007194#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007195 if (size > INT_MAX) {
7196 chunk_size = INT_MAX;
7197 final = 0;
7198 done = 0;
7199 }
7200 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007201#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007202 {
7203 chunk_size = (int)size;
7204 final = (consumed == NULL);
7205 done = 1;
7206 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007207
Victor Stinner76a31a62011-11-04 00:05:13 +01007208 if (chunk_size == 0 && done) {
7209 if (v != NULL)
7210 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007211 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007212 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007213
Victor Stinner76a31a62011-11-04 00:05:13 +01007214 converted = decode_code_page_strict(code_page, &v,
7215 s, chunk_size);
7216 if (converted == -2)
7217 converted = decode_code_page_errors(code_page, &v,
7218 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007219 errors, final);
7220 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007221
7222 if (converted < 0) {
7223 Py_XDECREF(v);
7224 return NULL;
7225 }
7226
7227 if (consumed)
7228 *consumed += converted;
7229
7230 s += converted;
7231 size -= converted;
7232 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007233
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007234 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007235}
7236
Alexander Belopolsky40018472011-02-26 01:02:56 +00007237PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007238PyUnicode_DecodeCodePageStateful(int code_page,
7239 const char *s,
7240 Py_ssize_t size,
7241 const char *errors,
7242 Py_ssize_t *consumed)
7243{
7244 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7245}
7246
7247PyObject *
7248PyUnicode_DecodeMBCSStateful(const char *s,
7249 Py_ssize_t size,
7250 const char *errors,
7251 Py_ssize_t *consumed)
7252{
7253 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7254}
7255
7256PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007257PyUnicode_DecodeMBCS(const char *s,
7258 Py_ssize_t size,
7259 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007260{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007261 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7262}
7263
Victor Stinner3a50e702011-10-18 21:21:00 +02007264static DWORD
7265encode_code_page_flags(UINT code_page, const char *errors)
7266{
7267 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007268 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 }
7270 else if (code_page == CP_UTF7) {
7271 /* CP_UTF7 only supports flags=0 */
7272 return 0;
7273 }
7274 else {
7275 if (errors != NULL && strcmp(errors, "replace") == 0)
7276 return 0;
7277 else
7278 return WC_NO_BEST_FIT_CHARS;
7279 }
7280}
7281
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007282/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007283 * Encode a Unicode string to a Windows code page into a byte string in strict
7284 * mode.
7285 *
7286 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007287 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007288 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007289static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007290encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007291 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007292 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007293{
Victor Stinner554f3f02010-06-16 23:33:54 +00007294 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007295 BOOL *pusedDefaultChar = &usedDefaultChar;
7296 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007297 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007298 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007299 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007300 const DWORD flags = encode_code_page_flags(code_page, NULL);
7301 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007302 /* Create a substring so that we can get the UTF-16 representation
7303 of just the slice under consideration. */
7304 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007305
Martin v. Löwis3d325192011-11-04 18:23:06 +01007306 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007307
Victor Stinner3a50e702011-10-18 21:21:00 +02007308 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007309 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007310 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007311 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007312
Victor Stinner2fc507f2011-11-04 20:06:39 +01007313 substring = PyUnicode_Substring(unicode, offset, offset+len);
7314 if (substring == NULL)
7315 return -1;
7316 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7317 if (p == NULL) {
7318 Py_DECREF(substring);
7319 return -1;
7320 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007321 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007322
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007323 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007324 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007325 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007326 NULL, 0,
7327 NULL, pusedDefaultChar);
7328 if (outsize <= 0)
7329 goto error;
7330 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007331 if (pusedDefaultChar && *pusedDefaultChar) {
7332 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007334 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007335
Victor Stinner3a50e702011-10-18 21:21:00 +02007336 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007339 if (*outbytes == NULL) {
7340 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007342 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007344 }
7345 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007346 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007347 const Py_ssize_t n = PyBytes_Size(*outbytes);
7348 if (outsize > PY_SSIZE_T_MAX - n) {
7349 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007350 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007352 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007353 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7354 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007355 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007356 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007358 }
7359
7360 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007361 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007362 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007363 out, outsize,
7364 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007365 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007366 if (outsize <= 0)
7367 goto error;
7368 if (pusedDefaultChar && *pusedDefaultChar)
7369 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007370 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007371
Victor Stinner3a50e702011-10-18 21:21:00 +02007372error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007373 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7375 return -2;
7376 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007377 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007378}
7379
Victor Stinner3a50e702011-10-18 21:21:00 +02007380/*
7381 * Encode a Unicode string to a Windows code page into a byte string using a
7382 * error handler.
7383 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007384 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 * -1 on other error.
7386 */
7387static int
7388encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007389 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007390 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007391{
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007393 Py_ssize_t pos = unicode_offset;
7394 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007395 /* Ideally, we should get reason from FormatMessage. This is the Windows
7396 2000 English version of the message. */
7397 const char *reason = "invalid character";
7398 /* 4=maximum length of a UTF-8 sequence */
7399 char buffer[4];
7400 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7401 Py_ssize_t outsize;
7402 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 PyObject *errorHandler = NULL;
7404 PyObject *exc = NULL;
7405 PyObject *encoding_obj = NULL;
7406 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007407 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007408 PyObject *rep;
7409 int ret = -1;
7410
7411 assert(insize > 0);
7412
7413 encoding = code_page_name(code_page, &encoding_obj);
7414 if (encoding == NULL)
7415 return -1;
7416
7417 if (errors == NULL || strcmp(errors, "strict") == 0) {
7418 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7419 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007420 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 if (exc != NULL) {
7422 PyCodec_StrictErrors(exc);
7423 Py_DECREF(exc);
7424 }
7425 Py_XDECREF(encoding_obj);
7426 return -1;
7427 }
7428
7429 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7430 pusedDefaultChar = &usedDefaultChar;
7431 else
7432 pusedDefaultChar = NULL;
7433
7434 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7435 PyErr_NoMemory();
7436 goto error;
7437 }
7438 outsize = insize * Py_ARRAY_LENGTH(buffer);
7439
7440 if (*outbytes == NULL) {
7441 /* Create string object */
7442 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7443 if (*outbytes == NULL)
7444 goto error;
7445 out = PyBytes_AS_STRING(*outbytes);
7446 }
7447 else {
7448 /* Extend string object */
7449 Py_ssize_t n = PyBytes_Size(*outbytes);
7450 if (n > PY_SSIZE_T_MAX - outsize) {
7451 PyErr_NoMemory();
7452 goto error;
7453 }
7454 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7455 goto error;
7456 out = PyBytes_AS_STRING(*outbytes) + n;
7457 }
7458
7459 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007460 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007462 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7463 wchar_t chars[2];
7464 int charsize;
7465 if (ch < 0x10000) {
7466 chars[0] = (wchar_t)ch;
7467 charsize = 1;
7468 }
7469 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007470 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7471 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007472 charsize = 2;
7473 }
7474
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007476 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007477 buffer, Py_ARRAY_LENGTH(buffer),
7478 NULL, pusedDefaultChar);
7479 if (outsize > 0) {
7480 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7481 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007482 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 memcpy(out, buffer, outsize);
7484 out += outsize;
7485 continue;
7486 }
7487 }
7488 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7489 PyErr_SetFromWindowsErr(0);
7490 goto error;
7491 }
7492
Victor Stinner3a50e702011-10-18 21:21:00 +02007493 rep = unicode_encode_call_errorhandler(
7494 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007495 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007496 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 if (rep == NULL)
7498 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007499 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007500
7501 if (PyBytes_Check(rep)) {
7502 outsize = PyBytes_GET_SIZE(rep);
7503 if (outsize != 1) {
7504 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7505 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7506 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7507 Py_DECREF(rep);
7508 goto error;
7509 }
7510 out = PyBytes_AS_STRING(*outbytes) + offset;
7511 }
7512 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7513 out += outsize;
7514 }
7515 else {
7516 Py_ssize_t i;
7517 enum PyUnicode_Kind kind;
7518 void *data;
7519
Benjamin Petersonbac79492012-01-14 13:34:47 -05007520 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007521 Py_DECREF(rep);
7522 goto error;
7523 }
7524
7525 outsize = PyUnicode_GET_LENGTH(rep);
7526 if (outsize != 1) {
7527 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7528 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7529 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7530 Py_DECREF(rep);
7531 goto error;
7532 }
7533 out = PyBytes_AS_STRING(*outbytes) + offset;
7534 }
7535 kind = PyUnicode_KIND(rep);
7536 data = PyUnicode_DATA(rep);
7537 for (i=0; i < outsize; i++) {
7538 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7539 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007540 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007541 encoding, unicode,
7542 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007543 "unable to encode error handler result to ASCII");
7544 Py_DECREF(rep);
7545 goto error;
7546 }
7547 *out = (unsigned char)ch;
7548 out++;
7549 }
7550 }
7551 Py_DECREF(rep);
7552 }
7553 /* write a NUL byte */
7554 *out = 0;
7555 outsize = out - PyBytes_AS_STRING(*outbytes);
7556 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7557 if (_PyBytes_Resize(outbytes, outsize) < 0)
7558 goto error;
7559 ret = 0;
7560
7561error:
7562 Py_XDECREF(encoding_obj);
7563 Py_XDECREF(errorHandler);
7564 Py_XDECREF(exc);
7565 return ret;
7566}
7567
Victor Stinner3a50e702011-10-18 21:21:00 +02007568static PyObject *
7569encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007570 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007571 const char *errors)
7572{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007573 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007574 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007575 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007576 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007577
Victor Stinner29dacf22015-01-26 16:41:32 +01007578 if (!PyUnicode_Check(unicode)) {
7579 PyErr_BadArgument();
7580 return NULL;
7581 }
7582
Benjamin Petersonbac79492012-01-14 13:34:47 -05007583 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007584 return NULL;
7585 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007586
Victor Stinner3a50e702011-10-18 21:21:00 +02007587 if (code_page < 0) {
7588 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7589 return NULL;
7590 }
7591
Martin v. Löwis3d325192011-11-04 18:23:06 +01007592 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007593 return PyBytes_FromStringAndSize(NULL, 0);
7594
Victor Stinner7581cef2011-11-03 22:32:33 +01007595 offset = 0;
7596 do
7597 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007598#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007599 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007600 chunks. */
7601 if (len > INT_MAX/2) {
7602 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007603 done = 0;
7604 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007605 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007606#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007607 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007608 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007609 done = 1;
7610 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007611
Victor Stinner76a31a62011-11-04 00:05:13 +01007612 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007613 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007614 errors);
7615 if (ret == -2)
7616 ret = encode_code_page_errors(code_page, &outbytes,
7617 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007618 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007619 if (ret < 0) {
7620 Py_XDECREF(outbytes);
7621 return NULL;
7622 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007623
Victor Stinner7581cef2011-11-03 22:32:33 +01007624 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007625 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007626 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007627
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 return outbytes;
7629}
7630
7631PyObject *
7632PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7633 Py_ssize_t size,
7634 const char *errors)
7635{
Victor Stinner7581cef2011-11-03 22:32:33 +01007636 PyObject *unicode, *res;
7637 unicode = PyUnicode_FromUnicode(p, size);
7638 if (unicode == NULL)
7639 return NULL;
7640 res = encode_code_page(CP_ACP, unicode, errors);
7641 Py_DECREF(unicode);
7642 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007643}
7644
7645PyObject *
7646PyUnicode_EncodeCodePage(int code_page,
7647 PyObject *unicode,
7648 const char *errors)
7649{
Victor Stinner7581cef2011-11-03 22:32:33 +01007650 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007651}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007652
Alexander Belopolsky40018472011-02-26 01:02:56 +00007653PyObject *
7654PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007655{
Victor Stinner7581cef2011-11-03 22:32:33 +01007656 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007657}
7658
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007659#undef NEED_RETRY
7660
Victor Stinner99b95382011-07-04 14:23:54 +02007661#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007662
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663/* --- Character Mapping Codec -------------------------------------------- */
7664
Victor Stinnerfb161b12013-04-18 01:44:27 +02007665static int
7666charmap_decode_string(const char *s,
7667 Py_ssize_t size,
7668 PyObject *mapping,
7669 const char *errors,
7670 _PyUnicodeWriter *writer)
7671{
7672 const char *starts = s;
7673 const char *e;
7674 Py_ssize_t startinpos, endinpos;
7675 PyObject *errorHandler = NULL, *exc = NULL;
7676 Py_ssize_t maplen;
7677 enum PyUnicode_Kind mapkind;
7678 void *mapdata;
7679 Py_UCS4 x;
7680 unsigned char ch;
7681
7682 if (PyUnicode_READY(mapping) == -1)
7683 return -1;
7684
7685 maplen = PyUnicode_GET_LENGTH(mapping);
7686 mapdata = PyUnicode_DATA(mapping);
7687 mapkind = PyUnicode_KIND(mapping);
7688
7689 e = s + size;
7690
7691 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7692 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7693 * is disabled in encoding aliases, latin1 is preferred because
7694 * its implementation is faster. */
7695 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7696 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7697 Py_UCS4 maxchar = writer->maxchar;
7698
7699 assert (writer->kind == PyUnicode_1BYTE_KIND);
7700 while (s < e) {
7701 ch = *s;
7702 x = mapdata_ucs1[ch];
7703 if (x > maxchar) {
7704 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7705 goto onError;
7706 maxchar = writer->maxchar;
7707 outdata = (Py_UCS1 *)writer->data;
7708 }
7709 outdata[writer->pos] = x;
7710 writer->pos++;
7711 ++s;
7712 }
7713 return 0;
7714 }
7715
7716 while (s < e) {
7717 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7718 enum PyUnicode_Kind outkind = writer->kind;
7719 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7720 if (outkind == PyUnicode_1BYTE_KIND) {
7721 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7722 Py_UCS4 maxchar = writer->maxchar;
7723 while (s < e) {
7724 ch = *s;
7725 x = mapdata_ucs2[ch];
7726 if (x > maxchar)
7727 goto Error;
7728 outdata[writer->pos] = x;
7729 writer->pos++;
7730 ++s;
7731 }
7732 break;
7733 }
7734 else if (outkind == PyUnicode_2BYTE_KIND) {
7735 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7736 while (s < e) {
7737 ch = *s;
7738 x = mapdata_ucs2[ch];
7739 if (x == 0xFFFE)
7740 goto Error;
7741 outdata[writer->pos] = x;
7742 writer->pos++;
7743 ++s;
7744 }
7745 break;
7746 }
7747 }
7748 ch = *s;
7749
7750 if (ch < maplen)
7751 x = PyUnicode_READ(mapkind, mapdata, ch);
7752 else
7753 x = 0xfffe; /* invalid value */
7754Error:
7755 if (x == 0xfffe)
7756 {
7757 /* undefined mapping */
7758 startinpos = s-starts;
7759 endinpos = startinpos+1;
7760 if (unicode_decode_call_errorhandler_writer(
7761 errors, &errorHandler,
7762 "charmap", "character maps to <undefined>",
7763 &starts, &e, &startinpos, &endinpos, &exc, &s,
7764 writer)) {
7765 goto onError;
7766 }
7767 continue;
7768 }
7769
7770 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7771 goto onError;
7772 ++s;
7773 }
7774 Py_XDECREF(errorHandler);
7775 Py_XDECREF(exc);
7776 return 0;
7777
7778onError:
7779 Py_XDECREF(errorHandler);
7780 Py_XDECREF(exc);
7781 return -1;
7782}
7783
7784static int
7785charmap_decode_mapping(const char *s,
7786 Py_ssize_t size,
7787 PyObject *mapping,
7788 const char *errors,
7789 _PyUnicodeWriter *writer)
7790{
7791 const char *starts = s;
7792 const char *e;
7793 Py_ssize_t startinpos, endinpos;
7794 PyObject *errorHandler = NULL, *exc = NULL;
7795 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007796 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007797
7798 e = s + size;
7799
7800 while (s < e) {
7801 ch = *s;
7802
7803 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7804 key = PyLong_FromLong((long)ch);
7805 if (key == NULL)
7806 goto onError;
7807
7808 item = PyObject_GetItem(mapping, key);
7809 Py_DECREF(key);
7810 if (item == NULL) {
7811 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7812 /* No mapping found means: mapping is undefined. */
7813 PyErr_Clear();
7814 goto Undefined;
7815 } else
7816 goto onError;
7817 }
7818
7819 /* Apply mapping */
7820 if (item == Py_None)
7821 goto Undefined;
7822 if (PyLong_Check(item)) {
7823 long value = PyLong_AS_LONG(item);
7824 if (value == 0xFFFE)
7825 goto Undefined;
7826 if (value < 0 || value > MAX_UNICODE) {
7827 PyErr_Format(PyExc_TypeError,
7828 "character mapping must be in range(0x%lx)",
7829 (unsigned long)MAX_UNICODE + 1);
7830 goto onError;
7831 }
7832
7833 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7834 goto onError;
7835 }
7836 else if (PyUnicode_Check(item)) {
7837 if (PyUnicode_READY(item) == -1)
7838 goto onError;
7839 if (PyUnicode_GET_LENGTH(item) == 1) {
7840 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7841 if (value == 0xFFFE)
7842 goto Undefined;
7843 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7844 goto onError;
7845 }
7846 else {
7847 writer->overallocate = 1;
7848 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7849 goto onError;
7850 }
7851 }
7852 else {
7853 /* wrong return value */
7854 PyErr_SetString(PyExc_TypeError,
7855 "character mapping must return integer, None or str");
7856 goto onError;
7857 }
7858 Py_CLEAR(item);
7859 ++s;
7860 continue;
7861
7862Undefined:
7863 /* undefined mapping */
7864 Py_CLEAR(item);
7865 startinpos = s-starts;
7866 endinpos = startinpos+1;
7867 if (unicode_decode_call_errorhandler_writer(
7868 errors, &errorHandler,
7869 "charmap", "character maps to <undefined>",
7870 &starts, &e, &startinpos, &endinpos, &exc, &s,
7871 writer)) {
7872 goto onError;
7873 }
7874 }
7875 Py_XDECREF(errorHandler);
7876 Py_XDECREF(exc);
7877 return 0;
7878
7879onError:
7880 Py_XDECREF(item);
7881 Py_XDECREF(errorHandler);
7882 Py_XDECREF(exc);
7883 return -1;
7884}
7885
Alexander Belopolsky40018472011-02-26 01:02:56 +00007886PyObject *
7887PyUnicode_DecodeCharmap(const char *s,
7888 Py_ssize_t size,
7889 PyObject *mapping,
7890 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007892 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007893
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894 /* Default to Latin-1 */
7895 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007899 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007900 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007901 writer.min_length = size;
7902 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007904
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007905 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007906 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7907 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007908 }
7909 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007910 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7911 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007913 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007914
Benjamin Peterson29060642009-01-31 22:14:21 +00007915 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007916 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917 return NULL;
7918}
7919
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007920/* Charmap encoding: the lookup table */
7921
Alexander Belopolsky40018472011-02-26 01:02:56 +00007922struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007923 PyObject_HEAD
7924 unsigned char level1[32];
7925 int count2, count3;
7926 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007927};
7928
7929static PyObject*
7930encoding_map_size(PyObject *obj, PyObject* args)
7931{
7932 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007933 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007935}
7936
7937static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007938 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 PyDoc_STR("Return the size (in bytes) of this object") },
7940 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007941};
7942
7943static void
7944encoding_map_dealloc(PyObject* o)
7945{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007946 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007947}
7948
7949static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007950 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 "EncodingMap", /*tp_name*/
7952 sizeof(struct encoding_map), /*tp_basicsize*/
7953 0, /*tp_itemsize*/
7954 /* methods */
7955 encoding_map_dealloc, /*tp_dealloc*/
7956 0, /*tp_print*/
7957 0, /*tp_getattr*/
7958 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007959 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 0, /*tp_repr*/
7961 0, /*tp_as_number*/
7962 0, /*tp_as_sequence*/
7963 0, /*tp_as_mapping*/
7964 0, /*tp_hash*/
7965 0, /*tp_call*/
7966 0, /*tp_str*/
7967 0, /*tp_getattro*/
7968 0, /*tp_setattro*/
7969 0, /*tp_as_buffer*/
7970 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7971 0, /*tp_doc*/
7972 0, /*tp_traverse*/
7973 0, /*tp_clear*/
7974 0, /*tp_richcompare*/
7975 0, /*tp_weaklistoffset*/
7976 0, /*tp_iter*/
7977 0, /*tp_iternext*/
7978 encoding_map_methods, /*tp_methods*/
7979 0, /*tp_members*/
7980 0, /*tp_getset*/
7981 0, /*tp_base*/
7982 0, /*tp_dict*/
7983 0, /*tp_descr_get*/
7984 0, /*tp_descr_set*/
7985 0, /*tp_dictoffset*/
7986 0, /*tp_init*/
7987 0, /*tp_alloc*/
7988 0, /*tp_new*/
7989 0, /*tp_free*/
7990 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007991};
7992
7993PyObject*
7994PyUnicode_BuildEncodingMap(PyObject* string)
7995{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007996 PyObject *result;
7997 struct encoding_map *mresult;
7998 int i;
7999 int need_dict = 0;
8000 unsigned char level1[32];
8001 unsigned char level2[512];
8002 unsigned char *mlevel1, *mlevel2, *mlevel3;
8003 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008004 int kind;
8005 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008006 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008007 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008008
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008009 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008010 PyErr_BadArgument();
8011 return NULL;
8012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008013 kind = PyUnicode_KIND(string);
8014 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008015 length = PyUnicode_GET_LENGTH(string);
8016 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008017 memset(level1, 0xFF, sizeof level1);
8018 memset(level2, 0xFF, sizeof level2);
8019
8020 /* If there isn't a one-to-one mapping of NULL to \0,
8021 or if there are non-BMP characters, we need to use
8022 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008023 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008024 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008025 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008026 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008027 ch = PyUnicode_READ(kind, data, i);
8028 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008029 need_dict = 1;
8030 break;
8031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008032 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008033 /* unmapped character */
8034 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008035 l1 = ch >> 11;
8036 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037 if (level1[l1] == 0xFF)
8038 level1[l1] = count2++;
8039 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008040 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041 }
8042
8043 if (count2 >= 0xFF || count3 >= 0xFF)
8044 need_dict = 1;
8045
8046 if (need_dict) {
8047 PyObject *result = PyDict_New();
8048 PyObject *key, *value;
8049 if (!result)
8050 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008051 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008052 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008053 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008054 if (!key || !value)
8055 goto failed1;
8056 if (PyDict_SetItem(result, key, value) == -1)
8057 goto failed1;
8058 Py_DECREF(key);
8059 Py_DECREF(value);
8060 }
8061 return result;
8062 failed1:
8063 Py_XDECREF(key);
8064 Py_XDECREF(value);
8065 Py_DECREF(result);
8066 return NULL;
8067 }
8068
8069 /* Create a three-level trie */
8070 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8071 16*count2 + 128*count3 - 1);
8072 if (!result)
8073 return PyErr_NoMemory();
8074 PyObject_Init(result, &EncodingMapType);
8075 mresult = (struct encoding_map*)result;
8076 mresult->count2 = count2;
8077 mresult->count3 = count3;
8078 mlevel1 = mresult->level1;
8079 mlevel2 = mresult->level23;
8080 mlevel3 = mresult->level23 + 16*count2;
8081 memcpy(mlevel1, level1, 32);
8082 memset(mlevel2, 0xFF, 16*count2);
8083 memset(mlevel3, 0, 128*count3);
8084 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008085 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008086 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008087 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8088 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008089 /* unmapped character */
8090 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008091 o1 = ch>>11;
8092 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008093 i2 = 16*mlevel1[o1] + o2;
8094 if (mlevel2[i2] == 0xFF)
8095 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008096 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008097 i3 = 128*mlevel2[i2] + o3;
8098 mlevel3[i3] = i;
8099 }
8100 return result;
8101}
8102
8103static int
Victor Stinner22168992011-11-20 17:09:18 +01008104encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008105{
8106 struct encoding_map *map = (struct encoding_map*)mapping;
8107 int l1 = c>>11;
8108 int l2 = (c>>7) & 0xF;
8109 int l3 = c & 0x7F;
8110 int i;
8111
Victor Stinner22168992011-11-20 17:09:18 +01008112 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114 if (c == 0)
8115 return 0;
8116 /* level 1*/
8117 i = map->level1[l1];
8118 if (i == 0xFF) {
8119 return -1;
8120 }
8121 /* level 2*/
8122 i = map->level23[16*i+l2];
8123 if (i == 0xFF) {
8124 return -1;
8125 }
8126 /* level 3 */
8127 i = map->level23[16*map->count2 + 128*i + l3];
8128 if (i == 0) {
8129 return -1;
8130 }
8131 return i;
8132}
8133
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008134/* Lookup the character ch in the mapping. If the character
8135 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008136 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008137static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008138charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139{
Christian Heimes217cfd12007-12-02 14:31:20 +00008140 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008141 PyObject *x;
8142
8143 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008145 x = PyObject_GetItem(mapping, w);
8146 Py_DECREF(w);
8147 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8149 /* No mapping found means: mapping is undefined. */
8150 PyErr_Clear();
8151 x = Py_None;
8152 Py_INCREF(x);
8153 return x;
8154 } else
8155 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008157 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008159 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 long value = PyLong_AS_LONG(x);
8161 if (value < 0 || value > 255) {
8162 PyErr_SetString(PyExc_TypeError,
8163 "character mapping must be in range(256)");
8164 Py_DECREF(x);
8165 return NULL;
8166 }
8167 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008169 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 /* wrong return value */
8173 PyErr_Format(PyExc_TypeError,
8174 "character mapping must return integer, bytes or None, not %.400s",
8175 x->ob_type->tp_name);
8176 Py_DECREF(x);
8177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008178 }
8179}
8180
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008181static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008182charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008183{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008184 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8185 /* exponentially overallocate to minimize reallocations */
8186 if (requiredsize < 2*outsize)
8187 requiredsize = 2*outsize;
8188 if (_PyBytes_Resize(outobj, requiredsize))
8189 return -1;
8190 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008191}
8192
/* Outcome of writing one encoded character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the mapping has no entry; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008196/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008197 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 space is available. Return a new reference to the object that
8199 was put in the output buffer, or Py_None, if the mapping was undefined
8200 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008201 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008202static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008203charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008204 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008205{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008206 PyObject *rep;
8207 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008208 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008209
Christian Heimes90aa7642007-12-19 02:45:37 +00008210 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008211 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008213 if (res == -1)
8214 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 if (outsize<requiredsize)
8216 if (charmapencode_resize(outobj, outpos, requiredsize))
8217 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008218 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 outstart[(*outpos)++] = (char)res;
8220 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 }
8222
8223 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008224 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008226 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 Py_DECREF(rep);
8228 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008229 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 if (PyLong_Check(rep)) {
8231 Py_ssize_t requiredsize = *outpos+1;
8232 if (outsize<requiredsize)
8233 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8234 Py_DECREF(rep);
8235 return enc_EXCEPTION;
8236 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008237 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008239 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 else {
8241 const char *repchars = PyBytes_AS_STRING(rep);
8242 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8243 Py_ssize_t requiredsize = *outpos+repsize;
8244 if (outsize<requiredsize)
8245 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8246 Py_DECREF(rep);
8247 return enc_EXCEPTION;
8248 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008249 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 memcpy(outstart + *outpos, repchars, repsize);
8251 *outpos += repsize;
8252 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008254 Py_DECREF(rep);
8255 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008256}
8257
8258/* handle an error in PyUnicode_EncodeCharmap
8259 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008260static int
8261charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008262 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008264 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008265 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008266{
8267 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008268 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008269 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008270 enum PyUnicode_Kind kind;
8271 void *data;
8272 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008274 Py_ssize_t collstartpos = *inpos;
8275 Py_ssize_t collendpos = *inpos+1;
8276 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008277 char *encoding = "charmap";
8278 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008279 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008280 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008281 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282
Benjamin Petersonbac79492012-01-14 13:34:47 -05008283 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008284 return -1;
8285 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286 /* find all unencodable characters */
8287 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008288 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008289 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008290 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008291 val = encoding_map_lookup(ch, mapping);
8292 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 break;
8294 ++collendpos;
8295 continue;
8296 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008297
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008298 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8299 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 if (rep==NULL)
8301 return -1;
8302 else if (rep!=Py_None) {
8303 Py_DECREF(rep);
8304 break;
8305 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008306 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008308 }
8309 /* cache callback name lookup
8310 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008311 if (*error_handler == _Py_ERROR_UNKNOWN)
8312 *error_handler = get_error_handler(errors);
8313
8314 switch (*error_handler) {
8315 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008316 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008317 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008318
8319 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008320 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 x = charmapencode_output('?', mapping, res, respos);
8322 if (x==enc_EXCEPTION) {
8323 return -1;
8324 }
8325 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008326 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 return -1;
8328 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008329 }
8330 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008331 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008332 *inpos = collendpos;
8333 break;
Victor Stinner50149202015-09-22 00:26:54 +02008334
8335 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008336 /* generate replacement (temporarily (mis)uses p) */
8337 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 char buffer[2+29+1+1];
8339 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008340 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 for (cp = buffer; *cp; ++cp) {
8342 x = charmapencode_output(*cp, mapping, res, respos);
8343 if (x==enc_EXCEPTION)
8344 return -1;
8345 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008346 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 return -1;
8348 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008349 }
8350 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008351 *inpos = collendpos;
8352 break;
Victor Stinner50149202015-09-22 00:26:54 +02008353
Benjamin Peterson14339b62009-01-31 16:36:08 +00008354 default:
Victor Stinner50149202015-09-22 00:26:54 +02008355 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008356 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008358 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008360 if (PyBytes_Check(repunicode)) {
8361 /* Directly copy bytes result to output. */
8362 Py_ssize_t outsize = PyBytes_Size(*res);
8363 Py_ssize_t requiredsize;
8364 repsize = PyBytes_Size(repunicode);
8365 requiredsize = *respos + repsize;
8366 if (requiredsize > outsize)
8367 /* Make room for all additional bytes. */
8368 if (charmapencode_resize(res, respos, requiredsize)) {
8369 Py_DECREF(repunicode);
8370 return -1;
8371 }
8372 memcpy(PyBytes_AsString(*res) + *respos,
8373 PyBytes_AsString(repunicode), repsize);
8374 *respos += repsize;
8375 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008376 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008377 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008378 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008379 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008380 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008381 Py_DECREF(repunicode);
8382 return -1;
8383 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008384 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008385 data = PyUnicode_DATA(repunicode);
8386 kind = PyUnicode_KIND(repunicode);
8387 for (index = 0; index < repsize; index++) {
8388 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8389 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008391 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 return -1;
8393 }
8394 else if (x==enc_FAILED) {
8395 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008396 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 return -1;
8398 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008399 }
8400 *inpos = newpos;
8401 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402 }
8403 return 0;
8404}
8405
Alexander Belopolsky40018472011-02-26 01:02:56 +00008406PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008407_PyUnicode_EncodeCharmap(PyObject *unicode,
8408 PyObject *mapping,
8409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 /* output object */
8412 PyObject *res = NULL;
8413 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008414 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008415 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008417 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008418 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008420 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008421 void *data;
8422 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423
Benjamin Petersonbac79492012-01-14 13:34:47 -05008424 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008425 return NULL;
8426 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008427 data = PyUnicode_DATA(unicode);
8428 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008429
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 /* Default to Latin-1 */
8431 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008432 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 /* allocate enough for a simple encoding without
8435 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008436 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008437 if (res == NULL)
8438 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008439 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008442 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008443 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008445 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 if (x==enc_EXCEPTION) /* error */
8447 goto onError;
8448 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008449 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008451 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 &res, &respos)) {
8453 goto onError;
8454 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 else
8457 /* done with this character => adjust input position */
8458 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008461 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008462 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008463 if (_PyBytes_Resize(&res, respos) < 0)
8464 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008465
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008467 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008468 return res;
8469
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008471 Py_XDECREF(res);
8472 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008473 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008474 return NULL;
8475}
8476
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008477/* Deprecated */
8478PyObject *
8479PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8480 Py_ssize_t size,
8481 PyObject *mapping,
8482 const char *errors)
8483{
8484 PyObject *result;
8485 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8486 if (unicode == NULL)
8487 return NULL;
8488 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8489 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008490 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008491}
8492
Alexander Belopolsky40018472011-02-26 01:02:56 +00008493PyObject *
8494PyUnicode_AsCharmapString(PyObject *unicode,
8495 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496{
8497 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 PyErr_BadArgument();
8499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008501 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502}
8503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008504/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008505static void
8506make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008508 Py_ssize_t startpos, Py_ssize_t endpos,
8509 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 *exceptionObject = _PyUnicodeTranslateError_Create(
8513 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514 }
8515 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8517 goto onError;
8518 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8519 goto onError;
8520 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8521 goto onError;
8522 return;
8523 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008524 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525 }
8526}
8527
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528/* error handling callback helper:
8529 build arguments, call the callback and check the arguments,
8530 put the result into newpos and return the replacement string, which
8531 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008532static PyObject *
8533unicode_translate_call_errorhandler(const char *errors,
8534 PyObject **errorHandler,
8535 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008537 Py_ssize_t startpos, Py_ssize_t endpos,
8538 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008539{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008540 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008542 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543 PyObject *restuple;
8544 PyObject *resunicode;
8545
8546 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550 }
8551
8552 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556
8557 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008562 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 Py_DECREF(restuple);
8564 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 }
8566 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 &resunicode, &i_newpos)) {
8568 Py_DECREF(restuple);
8569 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008571 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008573 else
8574 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008576 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 Py_DECREF(restuple);
8578 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008579 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008580 Py_INCREF(resunicode);
8581 Py_DECREF(restuple);
8582 return resunicode;
8583}
8584
8585/* Lookup the character ch in the mapping and put the result in result,
8586 which must be decrefed by the caller.
8587 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008588static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008590{
Christian Heimes217cfd12007-12-02 14:31:20 +00008591 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 PyObject *x;
8593
8594 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008596 x = PyObject_GetItem(mapping, w);
8597 Py_DECREF(w);
8598 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8600 /* No mapping found means: use 1:1 mapping. */
8601 PyErr_Clear();
8602 *result = NULL;
8603 return 0;
8604 } else
8605 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008606 }
8607 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 *result = x;
8609 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008610 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008611 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008613 if (value < 0 || value > MAX_UNICODE) {
8614 PyErr_Format(PyExc_ValueError,
8615 "character mapping must be in range(0x%x)",
8616 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 Py_DECREF(x);
8618 return -1;
8619 }
8620 *result = x;
8621 return 0;
8622 }
8623 else if (PyUnicode_Check(x)) {
8624 *result = x;
8625 return 0;
8626 }
8627 else {
8628 /* wrong return value */
8629 PyErr_SetString(PyExc_TypeError,
8630 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008631 Py_DECREF(x);
8632 return -1;
8633 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634}
Victor Stinner1194ea02014-04-04 19:37:40 +02008635
8636/* lookup the character, write the result into the writer.
8637 Return 1 if the result was written into the writer, return 0 if the mapping
8638 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008639static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008640charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8641 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642{
Victor Stinner1194ea02014-04-04 19:37:40 +02008643 PyObject *item;
8644
8645 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008647
8648 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008650 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008653 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008655
8656 if (item == Py_None) {
8657 Py_DECREF(item);
8658 return 0;
8659 }
8660
8661 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008662 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8663 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8664 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008665 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8666 Py_DECREF(item);
8667 return -1;
8668 }
8669 Py_DECREF(item);
8670 return 1;
8671 }
8672
8673 if (!PyUnicode_Check(item)) {
8674 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008676 }
8677
8678 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8679 Py_DECREF(item);
8680 return -1;
8681 }
8682
8683 Py_DECREF(item);
8684 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685}
8686
Victor Stinner89a76ab2014-04-05 11:44:04 +02008687static int
8688unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8689 Py_UCS1 *translate)
8690{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008691 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008692 int ret = 0;
8693
Victor Stinner89a76ab2014-04-05 11:44:04 +02008694 if (charmaptranslate_lookup(ch, mapping, &item)) {
8695 return -1;
8696 }
8697
8698 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008699 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008700 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008701 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008702 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008703 /* not found => default to 1:1 mapping */
8704 translate[ch] = ch;
8705 return 1;
8706 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008707 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008708 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008709 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8710 used it */
8711 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008712 /* invalid character or character outside ASCII:
8713 skip the fast translate */
8714 goto exit;
8715 }
8716 translate[ch] = (Py_UCS1)replace;
8717 }
8718 else if (PyUnicode_Check(item)) {
8719 Py_UCS4 replace;
8720
8721 if (PyUnicode_READY(item) == -1) {
8722 Py_DECREF(item);
8723 return -1;
8724 }
8725 if (PyUnicode_GET_LENGTH(item) != 1)
8726 goto exit;
8727
8728 replace = PyUnicode_READ_CHAR(item, 0);
8729 if (replace > 127)
8730 goto exit;
8731 translate[ch] = (Py_UCS1)replace;
8732 }
8733 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008734 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008735 goto exit;
8736 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008737 ret = 1;
8738
Benjamin Peterson1365de72014-04-07 20:15:41 -04008739 exit:
8740 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008741 return ret;
8742}
8743
8744/* Fast path for ascii => ascii translation. Return 1 if the whole string
8745 was translated into writer, return 0 if the input string was partially
8746 translated into writer, raise an exception and return -1 on error. */
8747static int
8748unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner872b2912014-04-05 14:27:07 +02008749 _PyUnicodeWriter *writer, int ignore)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008750{
Victor Stinner872b2912014-04-05 14:27:07 +02008751 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008752 Py_ssize_t len;
8753 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008754 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008755
8756 if (PyUnicode_READY(input) == -1)
8757 return -1;
8758 if (!PyUnicode_IS_ASCII(input))
8759 return 0;
8760 len = PyUnicode_GET_LENGTH(input);
8761
Victor Stinner872b2912014-04-05 14:27:07 +02008762 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008763
8764 in = PyUnicode_1BYTE_DATA(input);
8765 end = in + len;
8766
8767 assert(PyUnicode_IS_ASCII(writer->buffer));
8768 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8769 out = PyUnicode_1BYTE_DATA(writer->buffer);
8770
Victor Stinner872b2912014-04-05 14:27:07 +02008771 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008772 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008773 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008774 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008775 int translate = unicode_fast_translate_lookup(mapping, ch,
8776 ascii_table);
8777 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008778 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008779 if (translate == 0)
8780 goto exit;
8781 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008782 }
Victor Stinner872b2912014-04-05 14:27:07 +02008783 if (ch2 == 0xfe) {
8784 if (ignore)
8785 continue;
8786 goto exit;
8787 }
8788 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008789 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008790 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008791 }
Victor Stinner872b2912014-04-05 14:27:07 +02008792 res = 1;
8793
8794exit:
8795 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8796 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008797}
8798
Victor Stinner3222da22015-10-01 22:07:32 +02008799static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800_PyUnicode_TranslateCharmap(PyObject *input,
8801 PyObject *mapping,
8802 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008805 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 Py_ssize_t size, i;
8807 int kind;
8808 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008809 _PyUnicodeWriter writer;
8810 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008811 char *reason = "character maps to <undefined>";
8812 PyObject *errorHandler = NULL;
8813 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008814 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008815 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 PyErr_BadArgument();
8819 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008820 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 if (PyUnicode_READY(input) == -1)
8823 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008824 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 kind = PyUnicode_KIND(input);
8826 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827
8828 if (size == 0) {
8829 Py_INCREF(input);
8830 return input;
8831 }
8832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008833 /* allocate enough for a simple 1:1 translation without
8834 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008835 _PyUnicodeWriter_Init(&writer);
8836 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838
Victor Stinner872b2912014-04-05 14:27:07 +02008839 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8840
8841 res = unicode_fast_translate(input, mapping, &writer, ignore);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008842 if (res < 0) {
8843 _PyUnicodeWriter_Dealloc(&writer);
8844 return NULL;
8845 }
8846 if (res == 1)
8847 return _PyUnicodeWriter_Finish(&writer);
8848
Victor Stinner89a76ab2014-04-05 11:44:04 +02008849 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008852 int translate;
8853 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8854 Py_ssize_t newpos;
8855 /* startpos for collecting untranslatable chars */
8856 Py_ssize_t collstart;
8857 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008858 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859
Victor Stinner1194ea02014-04-04 19:37:40 +02008860 ch = PyUnicode_READ(kind, data, i);
8861 translate = charmaptranslate_output(ch, mapping, &writer);
8862 if (translate < 0)
8863 goto onError;
8864
8865 if (translate != 0) {
8866 /* it worked => adjust input pointer */
8867 ++i;
8868 continue;
8869 }
8870
8871 /* untranslatable character */
8872 collstart = i;
8873 collend = i+1;
8874
8875 /* find all untranslatable characters */
8876 while (collend < size) {
8877 PyObject *x;
8878 ch = PyUnicode_READ(kind, data, collend);
8879 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008880 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008881 Py_XDECREF(x);
8882 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008884 ++collend;
8885 }
8886
8887 if (ignore) {
8888 i = collend;
8889 }
8890 else {
8891 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8892 reason, input, &exc,
8893 collstart, collend, &newpos);
8894 if (repunicode == NULL)
8895 goto onError;
8896 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008898 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008899 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008900 Py_DECREF(repunicode);
8901 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008902 }
8903 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008904 Py_XDECREF(exc);
8905 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008906 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008909 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008910 Py_XDECREF(exc);
8911 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912 return NULL;
8913}
8914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915/* Deprecated. Use PyUnicode_Translate instead. */
8916PyObject *
8917PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8918 Py_ssize_t size,
8919 PyObject *mapping,
8920 const char *errors)
8921{
Christian Heimes5f520f42012-09-11 14:03:25 +02008922 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8924 if (!unicode)
8925 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008926 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8927 Py_DECREF(unicode);
8928 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929}
8930
Alexander Belopolsky40018472011-02-26 01:02:56 +00008931PyObject *
8932PyUnicode_Translate(PyObject *str,
8933 PyObject *mapping,
8934 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935{
8936 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008937
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938 str = PyUnicode_FromObject(str);
8939 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008940 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 Py_DECREF(str);
8943 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944}
Tim Petersced69f82003-09-16 20:30:58 +00008945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008947fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948{
8949 /* No need to call PyUnicode_READY(self) because this function is only
8950 called as a callback from fixup() which does it already. */
8951 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8952 const int kind = PyUnicode_KIND(self);
8953 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008954 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008955 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 Py_ssize_t i;
8957
8958 for (i = 0; i < len; ++i) {
8959 ch = PyUnicode_READ(kind, data, i);
8960 fixed = 0;
8961 if (ch > 127) {
8962 if (Py_UNICODE_ISSPACE(ch))
8963 fixed = ' ';
8964 else {
8965 const int decimal = Py_UNICODE_TODECIMAL(ch);
8966 if (decimal >= 0)
8967 fixed = '0' + decimal;
8968 }
8969 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008970 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008971 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 PyUnicode_WRITE(kind, data, i, fixed);
8973 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008974 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008975 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 }
8978
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008979 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980}
8981
8982PyObject *
8983_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8984{
8985 if (!PyUnicode_Check(unicode)) {
8986 PyErr_BadInternalCall();
8987 return NULL;
8988 }
8989 if (PyUnicode_READY(unicode) == -1)
8990 return NULL;
8991 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8992 /* If the string is already ASCII, just return the same string */
8993 Py_INCREF(unicode);
8994 return unicode;
8995 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008996 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997}
8998
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008999PyObject *
9000PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9001 Py_ssize_t length)
9002{
Victor Stinnerf0124502011-11-21 23:12:56 +01009003 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009004 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009005 Py_UCS4 maxchar;
9006 enum PyUnicode_Kind kind;
9007 void *data;
9008
Victor Stinner99d7ad02012-02-22 13:37:39 +01009009 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009010 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009011 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009012 if (ch > 127) {
9013 int decimal = Py_UNICODE_TODECIMAL(ch);
9014 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009015 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009016 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009017 }
9018 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009019
9020 /* Copy to a new string */
9021 decimal = PyUnicode_New(length, maxchar);
9022 if (decimal == NULL)
9023 return decimal;
9024 kind = PyUnicode_KIND(decimal);
9025 data = PyUnicode_DATA(decimal);
9026 /* Iterate over code points */
9027 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009028 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009029 if (ch > 127) {
9030 int decimal = Py_UNICODE_TODECIMAL(ch);
9031 if (decimal >= 0)
9032 ch = '0' + decimal;
9033 }
9034 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009036 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009037}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009038/* --- Decimal Encoder ---------------------------------------------------- */
9039
Alexander Belopolsky40018472011-02-26 01:02:56 +00009040int
9041PyUnicode_EncodeDecimal(Py_UNICODE *s,
9042 Py_ssize_t length,
9043 char *output,
9044 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009045{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009046 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009047 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009048 enum PyUnicode_Kind kind;
9049 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009050
9051 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009052 PyErr_BadArgument();
9053 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009054 }
9055
Victor Stinner42bf7752011-11-21 22:52:58 +01009056 unicode = PyUnicode_FromUnicode(s, length);
9057 if (unicode == NULL)
9058 return -1;
9059
Benjamin Petersonbac79492012-01-14 13:34:47 -05009060 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009061 Py_DECREF(unicode);
9062 return -1;
9063 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009064 kind = PyUnicode_KIND(unicode);
9065 data = PyUnicode_DATA(unicode);
9066
Victor Stinnerb84d7232011-11-22 01:50:07 +01009067 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009068 PyObject *exc;
9069 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009070 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009071 Py_ssize_t startpos;
9072
9073 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009074
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009076 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009077 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009078 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009079 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009080 decimal = Py_UNICODE_TODECIMAL(ch);
9081 if (decimal >= 0) {
9082 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009083 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009084 continue;
9085 }
9086 if (0 < ch && ch < 256) {
9087 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009088 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 continue;
9090 }
Victor Stinner6345be92011-11-25 20:09:01 +01009091
Victor Stinner42bf7752011-11-21 22:52:58 +01009092 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009093 exc = NULL;
9094 raise_encode_exception(&exc, "decimal", unicode,
9095 startpos, startpos+1,
9096 "invalid decimal Unicode string");
9097 Py_XDECREF(exc);
9098 Py_DECREF(unicode);
9099 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009100 }
9101 /* 0-terminate the output string */
9102 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009103 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009104 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009105}
9106
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107/* --- Helpers ------------------------------------------------------------ */
9108
/* Helper macro to fix up start/end slice values against a sequence of
   length len:

     - end greater than len is clamped to len;
     - a negative end has len added to it and is then clamped to 0;
     - a negative start has len added to it and is then clamped to 0.

   start is NOT clamped to len; callers compare start and end afterwards.

   start and end must be modifiable lvalues.  All three arguments are
   evaluated more than once, so they must be free of side effects.  The
   body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. under an unbraced if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009125any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 Py_ssize_t start,
9127 Py_ssize_t end)
9128{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009129 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 void *buf1, *buf2;
9131 Py_ssize_t len1, len2, result;
9132
9133 kind1 = PyUnicode_KIND(s1);
9134 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009135 if (kind1 < kind2)
9136 return -1;
9137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 len1 = PyUnicode_GET_LENGTH(s1);
9139 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009140 ADJUST_INDICES(start, end, len1);
9141 if (end - start < len2)
9142 return -1;
9143
9144 buf1 = PyUnicode_DATA(s1);
9145 buf2 = PyUnicode_DATA(s2);
9146 if (len2 == 1) {
9147 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9148 result = findchar((const char *)buf1 + kind1*start,
9149 kind1, end - start, ch, direction);
9150 if (result == -1)
9151 return -1;
9152 else
9153 return start + result;
9154 }
9155
9156 if (kind2 != kind1) {
9157 buf2 = _PyUnicode_AsKind(s2, kind1);
9158 if (!buf2)
9159 return -2;
9160 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161
Victor Stinner794d5672011-10-10 03:21:36 +02009162 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009163 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009164 case PyUnicode_1BYTE_KIND:
9165 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9166 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9167 else
9168 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9169 break;
9170 case PyUnicode_2BYTE_KIND:
9171 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9172 break;
9173 case PyUnicode_4BYTE_KIND:
9174 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9175 break;
9176 default:
9177 assert(0); result = -2;
9178 }
9179 }
9180 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009181 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009182 case PyUnicode_1BYTE_KIND:
9183 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9184 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9185 else
9186 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9187 break;
9188 case PyUnicode_2BYTE_KIND:
9189 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9190 break;
9191 case PyUnicode_4BYTE_KIND:
9192 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9193 break;
9194 default:
9195 assert(0); result = -2;
9196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 }
9198
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009199 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009200 PyMem_Free(buf2);
9201
9202 return result;
9203}
9204
9205Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009206_PyUnicode_InsertThousandsGrouping(
9207 PyObject *unicode, Py_ssize_t index,
9208 Py_ssize_t n_buffer,
9209 void *digits, Py_ssize_t n_digits,
9210 Py_ssize_t min_width,
9211 const char *grouping, PyObject *thousands_sep,
9212 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213{
Victor Stinner41a863c2012-02-24 00:37:51 +01009214 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009215 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009216 Py_ssize_t thousands_sep_len;
9217 Py_ssize_t len;
9218
9219 if (unicode != NULL) {
9220 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009221 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009222 }
9223 else {
9224 kind = PyUnicode_1BYTE_KIND;
9225 data = NULL;
9226 }
9227 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9228 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9229 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9230 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009231 if (thousands_sep_kind < kind) {
9232 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9233 if (!thousands_sep_data)
9234 return -1;
9235 }
9236 else {
9237 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9238 if (!data)
9239 return -1;
9240 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009241 }
9242
Benjamin Petersonead6b532011-12-20 17:23:42 -06009243 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009245 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009246 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009247 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009248 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009249 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009250 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009251 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009252 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009253 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009254 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009255 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009257 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009258 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009259 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009260 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009261 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009263 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009264 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009265 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009266 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009267 break;
9268 default:
9269 assert(0);
9270 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009272 if (unicode != NULL && thousands_sep_kind != kind) {
9273 if (thousands_sep_kind < kind)
9274 PyMem_Free(thousands_sep_data);
9275 else
9276 PyMem_Free(data);
9277 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009278 if (unicode == NULL) {
9279 *maxchar = 127;
9280 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009281 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009282 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009283 }
9284 }
9285 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286}
9287
9288
Alexander Belopolsky40018472011-02-26 01:02:56 +00009289Py_ssize_t
9290PyUnicode_Count(PyObject *str,
9291 PyObject *substr,
9292 Py_ssize_t start,
9293 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009295 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009296 PyObject* str_obj;
9297 PyObject* sub_obj;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009298 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 void *buf1 = NULL, *buf2 = NULL;
9300 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009301
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009302 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009303 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009304 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009305 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009306 if (!sub_obj) {
9307 Py_DECREF(str_obj);
9308 return -1;
9309 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009310 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009311 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009312 Py_DECREF(str_obj);
9313 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009314 }
Tim Petersced69f82003-09-16 20:30:58 +00009315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 kind1 = PyUnicode_KIND(str_obj);
9317 kind2 = PyUnicode_KIND(sub_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009318 if (kind1 < kind2) {
9319 Py_DECREF(sub_obj);
9320 Py_DECREF(str_obj);
9321 return 0;
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009322 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 len1 = PyUnicode_GET_LENGTH(str_obj);
9325 len2 = PyUnicode_GET_LENGTH(sub_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009326 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009327 if (end - start < len2) {
9328 Py_DECREF(sub_obj);
9329 Py_DECREF(str_obj);
9330 return 0;
9331 }
9332
9333 buf1 = PyUnicode_DATA(str_obj);
9334 buf2 = PyUnicode_DATA(sub_obj);
9335 if (kind2 != kind1) {
9336 buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9337 if (!buf2)
9338 goto onError;
9339 }
9340
9341 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009343 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9344 result = asciilib_count(
9345 ((Py_UCS1*)buf1) + start, end - start,
9346 buf2, len2, PY_SSIZE_T_MAX
9347 );
9348 else
9349 result = ucs1lib_count(
9350 ((Py_UCS1*)buf1) + start, end - start,
9351 buf2, len2, PY_SSIZE_T_MAX
9352 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 break;
9354 case PyUnicode_2BYTE_KIND:
9355 result = ucs2lib_count(
9356 ((Py_UCS2*)buf1) + start, end - start,
9357 buf2, len2, PY_SSIZE_T_MAX
9358 );
9359 break;
9360 case PyUnicode_4BYTE_KIND:
9361 result = ucs4lib_count(
9362 ((Py_UCS4*)buf1) + start, end - start,
9363 buf2, len2, PY_SSIZE_T_MAX
9364 );
9365 break;
9366 default:
9367 assert(0); result = 0;
9368 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009369
9370 Py_DECREF(sub_obj);
9371 Py_DECREF(str_obj);
9372
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009373 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 PyMem_Free(buf2);
9375
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 onError:
9378 Py_DECREF(sub_obj);
9379 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009380 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381 PyMem_Free(buf2);
9382 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383}
9384
Alexander Belopolsky40018472011-02-26 01:02:56 +00009385Py_ssize_t
9386PyUnicode_Find(PyObject *str,
9387 PyObject *sub,
9388 Py_ssize_t start,
9389 Py_ssize_t end,
9390 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009392 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009393
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009395 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009396 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009397 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009398 if (!sub) {
9399 Py_DECREF(str);
9400 return -2;
9401 }
9402 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9403 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009404 Py_DECREF(str);
9405 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406 }
Tim Petersced69f82003-09-16 20:30:58 +00009407
Victor Stinner794d5672011-10-10 03:21:36 +02009408 result = any_find_slice(direction,
9409 str, sub, start, end
9410 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009411
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009413 Py_DECREF(sub);
9414
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415 return result;
9416}
9417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418Py_ssize_t
9419PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9420 Py_ssize_t start, Py_ssize_t end,
9421 int direction)
9422{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009424 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 if (PyUnicode_READY(str) == -1)
9426 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009427 if (start < 0 || end < 0) {
9428 PyErr_SetString(PyExc_IndexError, "string index out of range");
9429 return -2;
9430 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 if (end > PyUnicode_GET_LENGTH(str))
9432 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009433 if (start >= end)
9434 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009436 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9437 kind, end-start, ch, direction);
9438 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009440 else
9441 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442}
9443
Alexander Belopolsky40018472011-02-26 01:02:56 +00009444static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009445tailmatch(PyObject *self,
9446 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009447 Py_ssize_t start,
9448 Py_ssize_t end,
9449 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 int kind_self;
9452 int kind_sub;
9453 void *data_self;
9454 void *data_sub;
9455 Py_ssize_t offset;
9456 Py_ssize_t i;
9457 Py_ssize_t end_sub;
9458
9459 if (PyUnicode_READY(self) == -1 ||
9460 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009461 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9464 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009466 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009468 if (PyUnicode_GET_LENGTH(substring) == 0)
9469 return 1;
9470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 kind_self = PyUnicode_KIND(self);
9472 data_self = PyUnicode_DATA(self);
9473 kind_sub = PyUnicode_KIND(substring);
9474 data_sub = PyUnicode_DATA(substring);
9475 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9476
9477 if (direction > 0)
9478 offset = end;
9479 else
9480 offset = start;
9481
9482 if (PyUnicode_READ(kind_self, data_self, offset) ==
9483 PyUnicode_READ(kind_sub, data_sub, 0) &&
9484 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9485 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9486 /* If both are of the same kind, memcmp is sufficient */
9487 if (kind_self == kind_sub) {
9488 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009489 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 data_sub,
9491 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009492 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 }
9494 /* otherwise we have to compare each character by first accesing it */
9495 else {
9496 /* We do not need to compare 0 and len(substring)-1 because
9497 the if statement above ensured already that they are equal
9498 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 for (i = 1; i < end_sub; ++i) {
9500 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9501 PyUnicode_READ(kind_sub, data_sub, i))
9502 return 0;
9503 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506 }
9507
9508 return 0;
9509}
9510
Alexander Belopolsky40018472011-02-26 01:02:56 +00009511Py_ssize_t
9512PyUnicode_Tailmatch(PyObject *str,
9513 PyObject *substr,
9514 Py_ssize_t start,
9515 Py_ssize_t end,
9516 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009518 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009519
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520 str = PyUnicode_FromObject(str);
9521 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523 substr = PyUnicode_FromObject(substr);
9524 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009525 Py_DECREF(str);
9526 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527 }
Tim Petersced69f82003-09-16 20:30:58 +00009528
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009529 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009530 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531 Py_DECREF(str);
9532 Py_DECREF(substr);
9533 return result;
9534}
9535
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536/* Apply fixfct filter to the Unicode object self and return a
9537 reference to the modified object */
9538
Alexander Belopolsky40018472011-02-26 01:02:56 +00009539static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009540fixup(PyObject *self,
9541 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 PyObject *u;
9544 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009545 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009547 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009550 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 /* fix functions return the new maximum character in a string,
9553 if the kind of the resulting unicode object does not change,
9554 everything is fine. Otherwise we need to change the string kind
9555 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009556 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009557
9558 if (maxchar_new == 0) {
9559 /* no changes */;
9560 if (PyUnicode_CheckExact(self)) {
9561 Py_DECREF(u);
9562 Py_INCREF(self);
9563 return self;
9564 }
9565 else
9566 return u;
9567 }
9568
Victor Stinnere6abb482012-05-02 01:15:40 +02009569 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570
Victor Stinnereaab6042011-12-11 22:22:39 +01009571 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009573
9574 /* In case the maximum character changed, we need to
9575 convert the string to the new category. */
9576 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9577 if (v == NULL) {
9578 Py_DECREF(u);
9579 return NULL;
9580 }
9581 if (maxchar_new > maxchar_old) {
9582 /* If the maxchar increased so that the kind changed, not all
9583 characters are representable anymore and we need to fix the
9584 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009585 _PyUnicode_FastCopyCharacters(v, 0,
9586 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009587 maxchar_old = fixfct(v);
9588 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009589 }
9590 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009591 _PyUnicode_FastCopyCharacters(v, 0,
9592 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009594 Py_DECREF(u);
9595 assert(_PyUnicode_CheckConsistency(v, 1));
9596 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597}
9598
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009599static PyObject *
9600ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009601{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009602 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9603 char *resdata, *data = PyUnicode_DATA(self);
9604 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009605
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009606 res = PyUnicode_New(len, 127);
9607 if (res == NULL)
9608 return NULL;
9609 resdata = PyUnicode_DATA(res);
9610 if (lower)
9611 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009613 _Py_bytes_upper(resdata, data, len);
9614 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009615}
9616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009618handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009620 Py_ssize_t j;
9621 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009622 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009623 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009624
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009625 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9626
9627 where ! is a negation and \p{xxx} is a character with property xxx.
9628 */
9629 for (j = i - 1; j >= 0; j--) {
9630 c = PyUnicode_READ(kind, data, j);
9631 if (!_PyUnicode_IsCaseIgnorable(c))
9632 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009634 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9635 if (final_sigma) {
9636 for (j = i + 1; j < length; j++) {
9637 c = PyUnicode_READ(kind, data, j);
9638 if (!_PyUnicode_IsCaseIgnorable(c))
9639 break;
9640 }
9641 final_sigma = j == length || !_PyUnicode_IsCased(c);
9642 }
9643 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644}
9645
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009646static int
9647lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9648 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009650 /* Obscure special case. */
9651 if (c == 0x3A3) {
9652 mapped[0] = handle_capital_sigma(kind, data, length, i);
9653 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009655 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656}
9657
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658static Py_ssize_t
9659do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009661 Py_ssize_t i, k = 0;
9662 int n_res, j;
9663 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009664
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009665 c = PyUnicode_READ(kind, data, 0);
9666 n_res = _PyUnicode_ToUpperFull(c, mapped);
9667 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009668 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009669 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009671 for (i = 1; i < length; i++) {
9672 c = PyUnicode_READ(kind, data, i);
9673 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9674 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009675 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009677 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009678 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680}
9681
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009682static Py_ssize_t
9683do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9684 Py_ssize_t i, k = 0;
9685
9686 for (i = 0; i < length; i++) {
9687 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9688 int n_res, j;
9689 if (Py_UNICODE_ISUPPER(c)) {
9690 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9691 }
9692 else if (Py_UNICODE_ISLOWER(c)) {
9693 n_res = _PyUnicode_ToUpperFull(c, mapped);
9694 }
9695 else {
9696 n_res = 1;
9697 mapped[0] = c;
9698 }
9699 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009700 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701 res[k++] = mapped[j];
9702 }
9703 }
9704 return k;
9705}
9706
9707static Py_ssize_t
9708do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9709 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 Py_ssize_t i, k = 0;
9712
9713 for (i = 0; i < length; i++) {
9714 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9715 int n_res, j;
9716 if (lower)
9717 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9718 else
9719 n_res = _PyUnicode_ToUpperFull(c, mapped);
9720 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009721 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009722 res[k++] = mapped[j];
9723 }
9724 }
9725 return k;
9726}
9727
9728static Py_ssize_t
9729do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9730{
9731 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9732}
9733
9734static Py_ssize_t
9735do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9736{
9737 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9738}
9739
Benjamin Petersone51757f2012-01-12 21:10:29 -05009740static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009741do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9742{
9743 Py_ssize_t i, k = 0;
9744
9745 for (i = 0; i < length; i++) {
9746 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9747 Py_UCS4 mapped[3];
9748 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9749 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009750 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009751 res[k++] = mapped[j];
9752 }
9753 }
9754 return k;
9755}
9756
9757static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009758do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9759{
9760 Py_ssize_t i, k = 0;
9761 int previous_is_cased;
9762
9763 previous_is_cased = 0;
9764 for (i = 0; i < length; i++) {
9765 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9766 Py_UCS4 mapped[3];
9767 int n_res, j;
9768
9769 if (previous_is_cased)
9770 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9771 else
9772 n_res = _PyUnicode_ToTitleFull(c, mapped);
9773
9774 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009775 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009776 res[k++] = mapped[j];
9777 }
9778
9779 previous_is_cased = _PyUnicode_IsCased(c);
9780 }
9781 return k;
9782}
9783
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009784static PyObject *
9785case_operation(PyObject *self,
9786 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9787{
9788 PyObject *res = NULL;
9789 Py_ssize_t length, newlength = 0;
9790 int kind, outkind;
9791 void *data, *outdata;
9792 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9793
Benjamin Petersoneea48462012-01-16 14:28:50 -05009794 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009795
9796 kind = PyUnicode_KIND(self);
9797 data = PyUnicode_DATA(self);
9798 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009799 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009800 PyErr_SetString(PyExc_OverflowError, "string is too long");
9801 return NULL;
9802 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009803 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009804 if (tmp == NULL)
9805 return PyErr_NoMemory();
9806 newlength = perform(kind, data, length, tmp, &maxchar);
9807 res = PyUnicode_New(newlength, maxchar);
9808 if (res == NULL)
9809 goto leave;
9810 tmpend = tmp + newlength;
9811 outdata = PyUnicode_DATA(res);
9812 outkind = PyUnicode_KIND(res);
9813 switch (outkind) {
9814 case PyUnicode_1BYTE_KIND:
9815 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9816 break;
9817 case PyUnicode_2BYTE_KIND:
9818 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9819 break;
9820 case PyUnicode_4BYTE_KIND:
9821 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9822 break;
9823 default:
9824 assert(0);
9825 break;
9826 }
9827 leave:
9828 PyMem_FREE(tmp);
9829 return res;
9830}
9831
Tim Peters8ce9f162004-08-27 01:49:32 +00009832PyObject *
9833PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009836 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009838 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009839 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9840 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009841 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009843 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009845 int use_memcpy;
9846 unsigned char *res_data = NULL, *sep_data = NULL;
9847 PyObject *last_obj;
9848 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009850 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009851 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009852 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009853 }
9854
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009855 /* NOTE: the following code can't call back into Python code,
9856 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009857 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009858
Tim Peters05eba1f2004-08-27 21:32:02 +00009859 seqlen = PySequence_Fast_GET_SIZE(fseq);
9860 /* If empty sequence, return u"". */
9861 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009862 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009863 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009864 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009865
Tim Peters05eba1f2004-08-27 21:32:02 +00009866 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009867 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009868 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009869 if (seqlen == 1) {
9870 if (PyUnicode_CheckExact(items[0])) {
9871 res = items[0];
9872 Py_INCREF(res);
9873 Py_DECREF(fseq);
9874 return res;
9875 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009876 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009877 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009878 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009879 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009880 /* Set up sep and seplen */
9881 if (separator == NULL) {
9882 /* fall back to a blank space separator */
9883 sep = PyUnicode_FromOrdinal(' ');
9884 if (!sep)
9885 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009886 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009887 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009888 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009889 else {
9890 if (!PyUnicode_Check(separator)) {
9891 PyErr_Format(PyExc_TypeError,
9892 "separator: expected str instance,"
9893 " %.80s found",
9894 Py_TYPE(separator)->tp_name);
9895 goto onError;
9896 }
9897 if (PyUnicode_READY(separator))
9898 goto onError;
9899 sep = separator;
9900 seplen = PyUnicode_GET_LENGTH(separator);
9901 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9902 /* inc refcount to keep this code path symmetric with the
9903 above case of a blank separator */
9904 Py_INCREF(sep);
9905 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009906 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009907 }
9908
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009909 /* There are at least two things to join, or else we have a subclass
9910 * of str in the sequence.
9911 * Do a pre-pass to figure out the total amount of space we'll
9912 * need (sz), and see whether all argument are strings.
9913 */
9914 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009915#ifdef Py_DEBUG
9916 use_memcpy = 0;
9917#else
9918 use_memcpy = 1;
9919#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009920 for (i = 0; i < seqlen; i++) {
9921 const Py_ssize_t old_sz = sz;
9922 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009923 if (!PyUnicode_Check(item)) {
9924 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009925 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009926 " %.80s found",
9927 i, Py_TYPE(item)->tp_name);
9928 goto onError;
9929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 if (PyUnicode_READY(item) == -1)
9931 goto onError;
9932 sz += PyUnicode_GET_LENGTH(item);
9933 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009934 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009935 if (i != 0)
9936 sz += seplen;
9937 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9938 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009939 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009940 goto onError;
9941 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009942 if (use_memcpy && last_obj != NULL) {
9943 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9944 use_memcpy = 0;
9945 }
9946 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009947 }
Tim Petersced69f82003-09-16 20:30:58 +00009948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009950 if (res == NULL)
9951 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009952
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009953 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009954#ifdef Py_DEBUG
9955 use_memcpy = 0;
9956#else
9957 if (use_memcpy) {
9958 res_data = PyUnicode_1BYTE_DATA(res);
9959 kind = PyUnicode_KIND(res);
9960 if (seplen != 0)
9961 sep_data = PyUnicode_1BYTE_DATA(sep);
9962 }
9963#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009964 if (use_memcpy) {
9965 for (i = 0; i < seqlen; ++i) {
9966 Py_ssize_t itemlen;
9967 item = items[i];
9968
9969 /* Copy item, and maybe the separator. */
9970 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009971 Py_MEMCPY(res_data,
9972 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009973 kind * seplen);
9974 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009975 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009976
9977 itemlen = PyUnicode_GET_LENGTH(item);
9978 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009979 Py_MEMCPY(res_data,
9980 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009981 kind * itemlen);
9982 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009983 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009984 }
9985 assert(res_data == PyUnicode_1BYTE_DATA(res)
9986 + kind * PyUnicode_GET_LENGTH(res));
9987 }
9988 else {
9989 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9990 Py_ssize_t itemlen;
9991 item = items[i];
9992
9993 /* Copy item, and maybe the separator. */
9994 if (i && seplen != 0) {
9995 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9996 res_offset += seplen;
9997 }
9998
9999 itemlen = PyUnicode_GET_LENGTH(item);
10000 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010001 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010002 res_offset += itemlen;
10003 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010004 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010005 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010006 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010007
Tim Peters05eba1f2004-08-27 21:32:02 +000010008 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010010 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012
Benjamin Peterson29060642009-01-31 22:14:21 +000010013 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010014 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010016 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017 return NULL;
10018}
10019
/* Store `value` into `length` consecutive code units of the buffer `data`,
   starting at code-unit index `start`.  `kind` selects the code-unit width
   (1, 2 or 4 bytes); PyUnicode_WCHAR_KIND is not allowed.

   Every macro argument is parenthesized where it is used so that
   expression arguments (e.g. `c ? a : b`) keep their intended meaning.
   Note that `value` and `length` are evaluated more than once, so they
   must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10043
Victor Stinnerd3f08822012-05-29 12:57:52 +020010044void
10045_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10046 Py_UCS4 fill_char)
10047{
10048 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10049 const void *data = PyUnicode_DATA(unicode);
10050 assert(PyUnicode_IS_READY(unicode));
10051 assert(unicode_modifiable(unicode));
10052 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10053 assert(start >= 0);
10054 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10055 FILL(kind, data, fill_char, start, length);
10056}
10057
Victor Stinner3fe55312012-01-04 00:33:50 +010010058Py_ssize_t
10059PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10060 Py_UCS4 fill_char)
10061{
10062 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010063
10064 if (!PyUnicode_Check(unicode)) {
10065 PyErr_BadInternalCall();
10066 return -1;
10067 }
10068 if (PyUnicode_READY(unicode) == -1)
10069 return -1;
10070 if (unicode_check_modifiable(unicode))
10071 return -1;
10072
Victor Stinnerd3f08822012-05-29 12:57:52 +020010073 if (start < 0) {
10074 PyErr_SetString(PyExc_IndexError, "string index out of range");
10075 return -1;
10076 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010077 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10078 PyErr_SetString(PyExc_ValueError,
10079 "fill character is bigger than "
10080 "the string maximum character");
10081 return -1;
10082 }
10083
10084 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10085 length = Py_MIN(maxlen, length);
10086 if (length <= 0)
10087 return 0;
10088
Victor Stinnerd3f08822012-05-29 12:57:52 +020010089 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010090 return length;
10091}
10092
Victor Stinner9310abb2011-10-05 00:59:23 +020010093static PyObject *
10094pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010095 Py_ssize_t left,
10096 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 PyObject *u;
10100 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010101 int kind;
10102 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010103
10104 if (left < 0)
10105 left = 0;
10106 if (right < 0)
10107 right = 0;
10108
Victor Stinnerc4b49542011-12-11 22:44:26 +010010109 if (left == 0 && right == 0)
10110 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10113 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010114 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10115 return NULL;
10116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010118 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010120 if (!u)
10121 return NULL;
10122
10123 kind = PyUnicode_KIND(u);
10124 data = PyUnicode_DATA(u);
10125 if (left)
10126 FILL(kind, data, fill, 0, left);
10127 if (right)
10128 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010129 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010130 assert(_PyUnicode_CheckConsistency(u, 1));
10131 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132}
10133
Alexander Belopolsky40018472011-02-26 01:02:56 +000010134PyObject *
10135PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138
10139 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010140 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010141 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010142 if (PyUnicode_READY(string) == -1) {
10143 Py_DECREF(string);
10144 return NULL;
10145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146
Benjamin Petersonead6b532011-12-20 17:23:42 -060010147 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010149 if (PyUnicode_IS_ASCII(string))
10150 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010151 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010152 PyUnicode_GET_LENGTH(string), keepends);
10153 else
10154 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010155 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010156 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 break;
10158 case PyUnicode_2BYTE_KIND:
10159 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010160 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 PyUnicode_GET_LENGTH(string), keepends);
10162 break;
10163 case PyUnicode_4BYTE_KIND:
10164 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010165 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 PyUnicode_GET_LENGTH(string), keepends);
10167 break;
10168 default:
10169 assert(0);
10170 list = 0;
10171 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172 Py_DECREF(string);
10173 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174}
10175
Alexander Belopolsky40018472011-02-26 01:02:56 +000010176static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010177split(PyObject *self,
10178 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010179 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010181 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 void *buf1, *buf2;
10183 Py_ssize_t len1, len2;
10184 PyObject* out;
10185
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010187 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 if (PyUnicode_READY(self) == -1)
10190 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010193 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010195 if (PyUnicode_IS_ASCII(self))
10196 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010197 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010198 PyUnicode_GET_LENGTH(self), maxcount
10199 );
10200 else
10201 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010202 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010203 PyUnicode_GET_LENGTH(self), maxcount
10204 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 case PyUnicode_2BYTE_KIND:
10206 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010207 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 PyUnicode_GET_LENGTH(self), maxcount
10209 );
10210 case PyUnicode_4BYTE_KIND:
10211 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010212 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 PyUnicode_GET_LENGTH(self), maxcount
10214 );
10215 default:
10216 assert(0);
10217 return NULL;
10218 }
10219
10220 if (PyUnicode_READY(substring) == -1)
10221 return NULL;
10222
10223 kind1 = PyUnicode_KIND(self);
10224 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 len1 = PyUnicode_GET_LENGTH(self);
10226 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010227 if (kind1 < kind2 || len1 < len2) {
10228 out = PyList_New(1);
10229 if (out == NULL)
10230 return NULL;
10231 Py_INCREF(self);
10232 PyList_SET_ITEM(out, 0, self);
10233 return out;
10234 }
10235 buf1 = PyUnicode_DATA(self);
10236 buf2 = PyUnicode_DATA(substring);
10237 if (kind2 != kind1) {
10238 buf2 = _PyUnicode_AsKind(substring, kind1);
10239 if (!buf2)
10240 return NULL;
10241 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010243 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010245 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10246 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010247 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010248 else
10249 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010250 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 break;
10252 case PyUnicode_2BYTE_KIND:
10253 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010254 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 break;
10256 case PyUnicode_4BYTE_KIND:
10257 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010258 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 break;
10260 default:
10261 out = NULL;
10262 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010263 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 PyMem_Free(buf2);
10265 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266}
10267
Alexander Belopolsky40018472011-02-26 01:02:56 +000010268static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010269rsplit(PyObject *self,
10270 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010271 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010272{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010273 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 void *buf1, *buf2;
10275 Py_ssize_t len1, len2;
10276 PyObject* out;
10277
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010278 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010279 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 if (PyUnicode_READY(self) == -1)
10282 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010285 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010287 if (PyUnicode_IS_ASCII(self))
10288 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010289 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010290 PyUnicode_GET_LENGTH(self), maxcount
10291 );
10292 else
10293 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010294 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010295 PyUnicode_GET_LENGTH(self), maxcount
10296 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 case PyUnicode_2BYTE_KIND:
10298 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010299 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 PyUnicode_GET_LENGTH(self), maxcount
10301 );
10302 case PyUnicode_4BYTE_KIND:
10303 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010304 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 PyUnicode_GET_LENGTH(self), maxcount
10306 );
10307 default:
10308 assert(0);
10309 return NULL;
10310 }
10311
10312 if (PyUnicode_READY(substring) == -1)
10313 return NULL;
10314
10315 kind1 = PyUnicode_KIND(self);
10316 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 len1 = PyUnicode_GET_LENGTH(self);
10318 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010319 if (kind1 < kind2 || len1 < len2) {
10320 out = PyList_New(1);
10321 if (out == NULL)
10322 return NULL;
10323 Py_INCREF(self);
10324 PyList_SET_ITEM(out, 0, self);
10325 return out;
10326 }
10327 buf1 = PyUnicode_DATA(self);
10328 buf2 = PyUnicode_DATA(substring);
10329 if (kind2 != kind1) {
10330 buf2 = _PyUnicode_AsKind(substring, kind1);
10331 if (!buf2)
10332 return NULL;
10333 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010335 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010337 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10338 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010339 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010340 else
10341 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010342 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 break;
10344 case PyUnicode_2BYTE_KIND:
10345 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010346 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 break;
10348 case PyUnicode_4BYTE_KIND:
10349 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010350 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 break;
10352 default:
10353 out = NULL;
10354 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010355 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 PyMem_Free(buf2);
10357 return out;
10358}
10359
10360static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010361anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10362 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010364 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010366 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10367 return asciilib_find(buf1, len1, buf2, len2, offset);
10368 else
10369 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 case PyUnicode_2BYTE_KIND:
10371 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10372 case PyUnicode_4BYTE_KIND:
10373 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10374 }
10375 assert(0);
10376 return -1;
10377}
10378
10379static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010380anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10381 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010383 switch (kind) {
10384 case PyUnicode_1BYTE_KIND:
10385 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10386 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10387 else
10388 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10389 case PyUnicode_2BYTE_KIND:
10390 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10391 case PyUnicode_4BYTE_KIND:
10392 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10393 }
10394 assert(0);
10395 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010396}
10397
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010398static void
10399replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10400 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10401{
10402 int kind = PyUnicode_KIND(u);
10403 void *data = PyUnicode_DATA(u);
10404 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10405 if (kind == PyUnicode_1BYTE_KIND) {
10406 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10407 (Py_UCS1 *)data + len,
10408 u1, u2, maxcount);
10409 }
10410 else if (kind == PyUnicode_2BYTE_KIND) {
10411 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10412 (Py_UCS2 *)data + len,
10413 u1, u2, maxcount);
10414 }
10415 else {
10416 assert(kind == PyUnicode_4BYTE_KIND);
10417 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10418 (Py_UCS4 *)data + len,
10419 u1, u2, maxcount);
10420 }
10421}
10422
Alexander Belopolsky40018472011-02-26 01:02:56 +000010423static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424replace(PyObject *self, PyObject *str1,
10425 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010426{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 PyObject *u;
10428 char *sbuf = PyUnicode_DATA(self);
10429 char *buf1 = PyUnicode_DATA(str1);
10430 char *buf2 = PyUnicode_DATA(str2);
10431 int srelease = 0, release1 = 0, release2 = 0;
10432 int skind = PyUnicode_KIND(self);
10433 int kind1 = PyUnicode_KIND(str1);
10434 int kind2 = PyUnicode_KIND(str2);
10435 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10436 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10437 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010438 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010439 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440
10441 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010442 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010444 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445
Victor Stinner59de0ee2011-10-07 10:01:28 +020010446 if (str1 == str2)
10447 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448
Victor Stinner49a0a212011-10-12 23:46:10 +020010449 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010450 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10451 if (maxchar < maxchar_str1)
10452 /* substring too wide to be present */
10453 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010454 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10455 /* Replacing str1 with str2 may cause a maxchar reduction in the
10456 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010457 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010458 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010463 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010465 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010466 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010467 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010468
Victor Stinner69ed0f42013-04-09 21:48:24 +020010469 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010470 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010471 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010472 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010473 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010475 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010477
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010478 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10479 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010480 }
10481 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 int rkind = skind;
10483 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010484 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (kind1 < rkind) {
10487 /* widen substring */
10488 buf1 = _PyUnicode_AsKind(str1, rkind);
10489 if (!buf1) goto error;
10490 release1 = 1;
10491 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010492 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010493 if (i < 0)
10494 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 if (rkind > kind2) {
10496 /* widen replacement */
10497 buf2 = _PyUnicode_AsKind(str2, rkind);
10498 if (!buf2) goto error;
10499 release2 = 1;
10500 }
10501 else if (rkind < kind2) {
10502 /* widen self and buf1 */
10503 rkind = kind2;
10504 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010505 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 sbuf = _PyUnicode_AsKind(self, rkind);
10507 if (!sbuf) goto error;
10508 srelease = 1;
10509 buf1 = _PyUnicode_AsKind(str1, rkind);
10510 if (!buf1) goto error;
10511 release1 = 1;
10512 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010513 u = PyUnicode_New(slen, maxchar);
10514 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010516 assert(PyUnicode_KIND(u) == rkind);
10517 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010518
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010519 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010520 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010521 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010523 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010525
10526 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010527 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010528 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010529 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010530 if (i == -1)
10531 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010532 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010534 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010538 }
10539 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010541 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 int rkind = skind;
10543 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010546 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 buf1 = _PyUnicode_AsKind(str1, rkind);
10548 if (!buf1) goto error;
10549 release1 = 1;
10550 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010551 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010552 if (n == 0)
10553 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010555 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 buf2 = _PyUnicode_AsKind(str2, rkind);
10557 if (!buf2) goto error;
10558 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010559 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010561 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 rkind = kind2;
10563 sbuf = _PyUnicode_AsKind(self, rkind);
10564 if (!sbuf) goto error;
10565 srelease = 1;
10566 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010567 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 buf1 = _PyUnicode_AsKind(str1, rkind);
10569 if (!buf1) goto error;
10570 release1 = 1;
10571 }
10572 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10573 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010574 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 PyErr_SetString(PyExc_OverflowError,
10576 "replace string is too long");
10577 goto error;
10578 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010579 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010580 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010581 _Py_INCREF_UNICODE_EMPTY();
10582 if (!unicode_empty)
10583 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010584 u = unicode_empty;
10585 goto done;
10586 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010587 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 PyErr_SetString(PyExc_OverflowError,
10589 "replace string is too long");
10590 goto error;
10591 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010592 u = PyUnicode_New(new_size, maxchar);
10593 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010595 assert(PyUnicode_KIND(u) == rkind);
10596 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 ires = i = 0;
10598 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010599 while (n-- > 0) {
10600 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010601 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010602 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010603 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010604 if (j == -1)
10605 break;
10606 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010607 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010608 memcpy(res + rkind * ires,
10609 sbuf + rkind * i,
10610 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010612 }
10613 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010615 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010617 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010621 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010623 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010624 memcpy(res + rkind * ires,
10625 sbuf + rkind * i,
10626 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010627 }
10628 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629 /* interleave */
10630 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010631 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010633 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010635 if (--n <= 0)
10636 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010637 memcpy(res + rkind * ires,
10638 sbuf + rkind * i,
10639 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 ires++;
10641 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010642 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010643 memcpy(res + rkind * ires,
10644 sbuf + rkind * i,
10645 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010646 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010647 }
10648
10649 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010650 unicode_adjust_maxchar(&u);
10651 if (u == NULL)
10652 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010653 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010654
10655 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (srelease)
10657 PyMem_FREE(sbuf);
10658 if (release1)
10659 PyMem_FREE(buf1);
10660 if (release2)
10661 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010662 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010664
Benjamin Peterson29060642009-01-31 22:14:21 +000010665 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010666 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 if (srelease)
10668 PyMem_FREE(sbuf);
10669 if (release1)
10670 PyMem_FREE(buf1);
10671 if (release2)
10672 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010673 return unicode_result_unchanged(self);
10674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 error:
10676 if (srelease && sbuf)
10677 PyMem_FREE(sbuf);
10678 if (release1 && buf1)
10679 PyMem_FREE(buf1);
10680 if (release2 && buf2)
10681 PyMem_FREE(buf2);
10682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683}
10684
10685/* --- Unicode Object Methods --------------------------------------------- */
10686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010687PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010688 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689\n\
10690Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010691characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692
10693static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010694unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010696 if (PyUnicode_READY(self) == -1)
10697 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010698 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699}
10700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010701PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010702 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703\n\
10704Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010705have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706
10707static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010708unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010710 if (PyUnicode_READY(self) == -1)
10711 return NULL;
10712 if (PyUnicode_GET_LENGTH(self) == 0)
10713 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010714 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715}
10716
Benjamin Petersond5890c82012-01-14 13:23:30 -050010717PyDoc_STRVAR(casefold__doc__,
10718 "S.casefold() -> str\n\
10719\n\
10720Return a version of S suitable for caseless comparisons.");
10721
10722static PyObject *
10723unicode_casefold(PyObject *self)
10724{
10725 if (PyUnicode_READY(self) == -1)
10726 return NULL;
10727 if (PyUnicode_IS_ASCII(self))
10728 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010729 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010730}
10731
10732
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010733/* Argument converter. Coerces to a single unicode character */
10734
10735static int
10736convert_uc(PyObject *obj, void *addr)
10737{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010739 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010740
Benjamin Peterson14339b62009-01-31 16:36:08 +000010741 uniobj = PyUnicode_FromObject(obj);
10742 if (uniobj == NULL) {
10743 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010744 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010745 return 0;
10746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010748 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010749 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010750 Py_DECREF(uniobj);
10751 return 0;
10752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010754 Py_DECREF(uniobj);
10755 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010756}
10757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010758PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010759 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010761Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010762done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763
10764static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010765unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010767 Py_ssize_t marg, left;
10768 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 Py_UCS4 fillchar = ' ';
10770
Victor Stinnere9a29352011-10-01 02:14:59 +020010771 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773
Benjamin Petersonbac79492012-01-14 13:34:47 -050010774 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775 return NULL;
10776
Victor Stinnerc4b49542011-12-11 22:44:26 +010010777 if (PyUnicode_GET_LENGTH(self) >= width)
10778 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779
Victor Stinnerc4b49542011-12-11 22:44:26 +010010780 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781 left = marg / 2 + (marg & width & 1);
10782
Victor Stinner9310abb2011-10-05 00:59:23 +020010783 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784}
10785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786/* This function assumes that str1 and str2 are readied by the caller. */
10787
Marc-André Lemburge5034372000-08-08 08:04:29 +000010788static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010789unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010790{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010791#define COMPARE(TYPE1, TYPE2) \
10792 do { \
10793 TYPE1* p1 = (TYPE1 *)data1; \
10794 TYPE2* p2 = (TYPE2 *)data2; \
10795 TYPE1* end = p1 + len; \
10796 Py_UCS4 c1, c2; \
10797 for (; p1 != end; p1++, p2++) { \
10798 c1 = *p1; \
10799 c2 = *p2; \
10800 if (c1 != c2) \
10801 return (c1 < c2) ? -1 : 1; \
10802 } \
10803 } \
10804 while (0)
10805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 int kind1, kind2;
10807 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010808 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 kind1 = PyUnicode_KIND(str1);
10811 kind2 = PyUnicode_KIND(str2);
10812 data1 = PyUnicode_DATA(str1);
10813 data2 = PyUnicode_DATA(str2);
10814 len1 = PyUnicode_GET_LENGTH(str1);
10815 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010816 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010817
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010818 switch(kind1) {
10819 case PyUnicode_1BYTE_KIND:
10820 {
10821 switch(kind2) {
10822 case PyUnicode_1BYTE_KIND:
10823 {
10824 int cmp = memcmp(data1, data2, len);
10825 /* normalize result of memcmp() into the range [-1; 1] */
10826 if (cmp < 0)
10827 return -1;
10828 if (cmp > 0)
10829 return 1;
10830 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010831 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010832 case PyUnicode_2BYTE_KIND:
10833 COMPARE(Py_UCS1, Py_UCS2);
10834 break;
10835 case PyUnicode_4BYTE_KIND:
10836 COMPARE(Py_UCS1, Py_UCS4);
10837 break;
10838 default:
10839 assert(0);
10840 }
10841 break;
10842 }
10843 case PyUnicode_2BYTE_KIND:
10844 {
10845 switch(kind2) {
10846 case PyUnicode_1BYTE_KIND:
10847 COMPARE(Py_UCS2, Py_UCS1);
10848 break;
10849 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010850 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010851 COMPARE(Py_UCS2, Py_UCS2);
10852 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010853 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010854 case PyUnicode_4BYTE_KIND:
10855 COMPARE(Py_UCS2, Py_UCS4);
10856 break;
10857 default:
10858 assert(0);
10859 }
10860 break;
10861 }
10862 case PyUnicode_4BYTE_KIND:
10863 {
10864 switch(kind2) {
10865 case PyUnicode_1BYTE_KIND:
10866 COMPARE(Py_UCS4, Py_UCS1);
10867 break;
10868 case PyUnicode_2BYTE_KIND:
10869 COMPARE(Py_UCS4, Py_UCS2);
10870 break;
10871 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010872 {
10873#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10874 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10875 /* normalize result of wmemcmp() into the range [-1; 1] */
10876 if (cmp < 0)
10877 return -1;
10878 if (cmp > 0)
10879 return 1;
10880#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010881 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010882#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010883 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010884 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010885 default:
10886 assert(0);
10887 }
10888 break;
10889 }
10890 default:
10891 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010892 }
10893
Victor Stinner770e19e2012-10-04 22:59:45 +020010894 if (len1 == len2)
10895 return 0;
10896 if (len1 < len2)
10897 return -1;
10898 else
10899 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010900
10901#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010902}
10903
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010904Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010905unicode_compare_eq(PyObject *str1, PyObject *str2)
10906{
10907 int kind;
10908 void *data1, *data2;
10909 Py_ssize_t len;
10910 int cmp;
10911
Victor Stinnere5567ad2012-10-23 02:48:49 +020010912 len = PyUnicode_GET_LENGTH(str1);
10913 if (PyUnicode_GET_LENGTH(str2) != len)
10914 return 0;
10915 kind = PyUnicode_KIND(str1);
10916 if (PyUnicode_KIND(str2) != kind)
10917 return 0;
10918 data1 = PyUnicode_DATA(str1);
10919 data2 = PyUnicode_DATA(str2);
10920
10921 cmp = memcmp(data1, data2, len * kind);
10922 return (cmp == 0);
10923}
10924
10925
Alexander Belopolsky40018472011-02-26 01:02:56 +000010926int
10927PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10930 if (PyUnicode_READY(left) == -1 ||
10931 PyUnicode_READY(right) == -1)
10932 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010933
10934 /* a string is equal to itself */
10935 if (left == right)
10936 return 0;
10937
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010938 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010940 PyErr_Format(PyExc_TypeError,
10941 "Can't compare %.100s and %.100s",
10942 left->ob_type->tp_name,
10943 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944 return -1;
10945}
10946
Martin v. Löwis5b222132007-06-10 09:51:05 +000010947int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010948_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10949{
10950 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10951 if (right_str == NULL)
10952 return -1;
10953 return PyUnicode_Compare(left, right_str);
10954}
10955
10956int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010957PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 Py_ssize_t i;
10960 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961 Py_UCS4 chr;
10962
Victor Stinner910337b2011-10-03 03:20:16 +020010963 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 if (PyUnicode_READY(uni) == -1)
10965 return -1;
10966 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010967 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010968 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010969 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010970 size_t len, len2 = strlen(str);
10971 int cmp;
10972
10973 len = Py_MIN(len1, len2);
10974 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010975 if (cmp != 0) {
10976 if (cmp < 0)
10977 return -1;
10978 else
10979 return 1;
10980 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010981 if (len1 > len2)
10982 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010983 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010984 return -1; /* str is longer */
10985 return 0;
10986 }
10987 else {
10988 void *data = PyUnicode_DATA(uni);
10989 /* Compare Unicode string and source character set string */
10990 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010991 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010992 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10993 /* This check keeps Python strings that end in '\0' from comparing equal
10994 to C strings identical up to that point. */
10995 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10996 return 1; /* uni is longer */
10997 if (str[i])
10998 return -1; /* str is longer */
10999 return 0;
11000 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011001}
11002
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011003
Benjamin Peterson29060642009-01-31 22:14:21 +000011004#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011005 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011006
Alexander Belopolsky40018472011-02-26 01:02:56 +000011007PyObject *
11008PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011009{
11010 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011011 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011012
Victor Stinnere5567ad2012-10-23 02:48:49 +020011013 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11014 Py_RETURN_NOTIMPLEMENTED;
11015
11016 if (PyUnicode_READY(left) == -1 ||
11017 PyUnicode_READY(right) == -1)
11018 return NULL;
11019
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011020 if (left == right) {
11021 switch (op) {
11022 case Py_EQ:
11023 case Py_LE:
11024 case Py_GE:
11025 /* a string is equal to itself */
11026 v = Py_True;
11027 break;
11028 case Py_NE:
11029 case Py_LT:
11030 case Py_GT:
11031 v = Py_False;
11032 break;
11033 default:
11034 PyErr_BadArgument();
11035 return NULL;
11036 }
11037 }
11038 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011039 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011040 result ^= (op == Py_NE);
11041 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011042 }
11043 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011044 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011045
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011046 /* Convert the return value to a Boolean */
11047 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011048 case Py_LE:
11049 v = TEST_COND(result <= 0);
11050 break;
11051 case Py_GE:
11052 v = TEST_COND(result >= 0);
11053 break;
11054 case Py_LT:
11055 v = TEST_COND(result == -1);
11056 break;
11057 case Py_GT:
11058 v = TEST_COND(result == 1);
11059 break;
11060 default:
11061 PyErr_BadArgument();
11062 return NULL;
11063 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011064 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011065 Py_INCREF(v);
11066 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011067}
11068
Alexander Belopolsky40018472011-02-26 01:02:56 +000011069int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011070_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11071{
11072 return unicode_eq(aa, bb);
11073}
11074
11075int
Alexander Belopolsky40018472011-02-26 01:02:56 +000011076PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011077{
Thomas Wouters477c8d52006-05-27 19:21:47 +000011078 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020011079 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 void *buf1, *buf2;
11081 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011082 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011083
11084 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000011085 sub = PyUnicode_FromObject(element);
11086 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011087 PyErr_Format(PyExc_TypeError,
11088 "'in <string>' requires string as left operand, not %s",
11089 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011090 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011091 }
11092
Thomas Wouters477c8d52006-05-27 19:21:47 +000011093 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011094 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011095 Py_DECREF(sub);
11096 return -1;
11097 }
11098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099 kind1 = PyUnicode_KIND(str);
11100 kind2 = PyUnicode_KIND(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011101 if (kind1 < kind2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050011103 Py_DECREF(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011104 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 }
11106 len1 = PyUnicode_GET_LENGTH(str);
11107 len2 = PyUnicode_GET_LENGTH(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011108 if (len1 < len2) {
11109 Py_DECREF(sub);
11110 Py_DECREF(str);
11111 return 0;
11112 }
11113 buf1 = PyUnicode_DATA(str);
11114 buf2 = PyUnicode_DATA(sub);
11115 if (len2 == 1) {
11116 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11117 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11118 Py_DECREF(sub);
11119 Py_DECREF(str);
11120 return result;
11121 }
11122 if (kind2 != kind1) {
11123 buf2 = _PyUnicode_AsKind(sub, kind1);
11124 if (!buf2) {
11125 Py_DECREF(sub);
11126 Py_DECREF(str);
11127 return -1;
11128 }
11129 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130
Victor Stinner77282cb2013-04-14 19:22:47 +020011131 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 case PyUnicode_1BYTE_KIND:
11133 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11134 break;
11135 case PyUnicode_2BYTE_KIND:
11136 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11137 break;
11138 case PyUnicode_4BYTE_KIND:
11139 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11140 break;
11141 default:
11142 result = -1;
11143 assert(0);
11144 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011145
11146 Py_DECREF(str);
11147 Py_DECREF(sub);
11148
Victor Stinner77282cb2013-04-14 19:22:47 +020011149 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 PyMem_Free(buf2);
11151
Guido van Rossum403d68b2000-03-13 15:55:09 +000011152 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011153}
11154
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155/* Concat to string or Unicode object giving a new Unicode object. */
11156
Alexander Belopolsky40018472011-02-26 01:02:56 +000011157PyObject *
11158PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011161 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011162 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163
11164 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011167 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011170 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171
11172 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011173 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011174 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011177 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011178 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180 }
11181
Victor Stinner488fa492011-12-12 00:01:39 +010011182 u_len = PyUnicode_GET_LENGTH(u);
11183 v_len = PyUnicode_GET_LENGTH(v);
11184 if (u_len > PY_SSIZE_T_MAX - v_len) {
11185 PyErr_SetString(PyExc_OverflowError,
11186 "strings are too large to concat");
11187 goto onError;
11188 }
11189 new_len = u_len + v_len;
11190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011192 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011193 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011196 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011198 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011199 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11200 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201 Py_DECREF(u);
11202 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011203 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205
Benjamin Peterson29060642009-01-31 22:14:21 +000011206 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207 Py_XDECREF(u);
11208 Py_XDECREF(v);
11209 return NULL;
11210}
11211
Walter Dörwald1ab83302007-05-18 17:15:44 +000011212void
Victor Stinner23e56682011-10-03 03:54:37 +020011213PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011214{
Victor Stinner23e56682011-10-03 03:54:37 +020011215 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011216 Py_UCS4 maxchar, maxchar2;
11217 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011218
11219 if (p_left == NULL) {
11220 if (!PyErr_Occurred())
11221 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011222 return;
11223 }
Victor Stinner23e56682011-10-03 03:54:37 +020011224 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011225 if (right == NULL || left == NULL
11226 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011227 if (!PyErr_Occurred())
11228 PyErr_BadInternalCall();
11229 goto error;
11230 }
11231
Benjamin Petersonbac79492012-01-14 13:34:47 -050011232 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011233 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011234 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011235 goto error;
11236
Victor Stinner488fa492011-12-12 00:01:39 +010011237 /* Shortcuts */
11238 if (left == unicode_empty) {
11239 Py_DECREF(left);
11240 Py_INCREF(right);
11241 *p_left = right;
11242 return;
11243 }
11244 if (right == unicode_empty)
11245 return;
11246
11247 left_len = PyUnicode_GET_LENGTH(left);
11248 right_len = PyUnicode_GET_LENGTH(right);
11249 if (left_len > PY_SSIZE_T_MAX - right_len) {
11250 PyErr_SetString(PyExc_OverflowError,
11251 "strings are too large to concat");
11252 goto error;
11253 }
11254 new_len = left_len + right_len;
11255
11256 if (unicode_modifiable(left)
11257 && PyUnicode_CheckExact(right)
11258 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011259 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11260 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011261 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011262 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011263 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11264 {
11265 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011266 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011267 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011268
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011269 /* copy 'right' into the newly allocated area of 'left' */
11270 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011271 }
Victor Stinner488fa492011-12-12 00:01:39 +010011272 else {
11273 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11274 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011275 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011276
Victor Stinner488fa492011-12-12 00:01:39 +010011277 /* Concat the two Unicode strings */
11278 res = PyUnicode_New(new_len, maxchar);
11279 if (res == NULL)
11280 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011281 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11282 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011283 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011284 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011285 }
11286 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011287 return;
11288
11289error:
Victor Stinner488fa492011-12-12 00:01:39 +010011290 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011291}
11292
11293void
11294PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11295{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011296 PyUnicode_Append(pleft, right);
11297 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011298}
11299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011300PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011301 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011303Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011304string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011305interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306
11307static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011308unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011310 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011311 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011312 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011314 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315 void *buf1, *buf2;
11316 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317
Jesus Ceaac451502011-04-20 17:09:23 +020011318 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11319 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011320 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 kind1 = PyUnicode_KIND(self);
11323 kind2 = PyUnicode_KIND(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011324 if (kind1 < kind2) {
Christian Heimesd47802e2013-06-29 21:33:36 +020011325 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011326 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011327 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 len1 = PyUnicode_GET_LENGTH(self);
11329 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011331 if (end - start < len2) {
11332 Py_DECREF(substring);
11333 return PyLong_FromLong(0);
11334 }
11335 buf1 = PyUnicode_DATA(self);
11336 buf2 = PyUnicode_DATA(substring);
11337 if (kind2 != kind1) {
11338 buf2 = _PyUnicode_AsKind(substring, kind1);
11339 if (!buf2) {
11340 Py_DECREF(substring);
11341 return NULL;
11342 }
11343 }
11344 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 case PyUnicode_1BYTE_KIND:
11346 iresult = ucs1lib_count(
11347 ((Py_UCS1*)buf1) + start, end - start,
11348 buf2, len2, PY_SSIZE_T_MAX
11349 );
11350 break;
11351 case PyUnicode_2BYTE_KIND:
11352 iresult = ucs2lib_count(
11353 ((Py_UCS2*)buf1) + start, end - start,
11354 buf2, len2, PY_SSIZE_T_MAX
11355 );
11356 break;
11357 case PyUnicode_4BYTE_KIND:
11358 iresult = ucs4lib_count(
11359 ((Py_UCS4*)buf1) + start, end - start,
11360 buf2, len2, PY_SSIZE_T_MAX
11361 );
11362 break;
11363 default:
11364 assert(0); iresult = 0;
11365 }
11366
11367 result = PyLong_FromSsize_t(iresult);
11368
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011369 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
11372 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011373
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374 return result;
11375}
11376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011377PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011378 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011380Encode S using the codec registered for encoding. Default encoding\n\
11381is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011382handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011383a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11384'xmlcharrefreplace' as well as any other name registered with\n\
11385codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386
11387static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011388unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011390 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391 char *encoding = NULL;
11392 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011393
Benjamin Peterson308d6372009-09-18 21:42:35 +000011394 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11395 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011397 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011398}
11399
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011400PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011401 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402\n\
11403Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011404If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405
11406static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011407unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011409 Py_ssize_t i, j, line_pos, src_len, incr;
11410 Py_UCS4 ch;
11411 PyObject *u;
11412 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011413 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011415 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011416 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417
Ezio Melotti745d54d2013-11-16 19:10:57 +020011418 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11419 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011420 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
Antoine Pitrou22425222011-10-04 19:10:51 +020011422 if (PyUnicode_READY(self) == -1)
11423 return NULL;
11424
Thomas Wouters7e474022000-07-16 12:04:32 +000011425 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011426 src_len = PyUnicode_GET_LENGTH(self);
11427 i = j = line_pos = 0;
11428 kind = PyUnicode_KIND(self);
11429 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011430 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011431 for (; i < src_len; i++) {
11432 ch = PyUnicode_READ(kind, src_data, i);
11433 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011434 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011435 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011436 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011437 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011438 goto overflow;
11439 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011440 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011441 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011445 goto overflow;
11446 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011448 if (ch == '\n' || ch == '\r')
11449 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011451 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011452 if (!found)
11453 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011454
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011456 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457 if (!u)
11458 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011459 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460
Antoine Pitroue71d5742011-10-04 15:55:09 +020011461 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462
Antoine Pitroue71d5742011-10-04 15:55:09 +020011463 for (; i < src_len; i++) {
11464 ch = PyUnicode_READ(kind, src_data, i);
11465 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011466 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011467 incr = tabsize - (line_pos % tabsize);
11468 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011469 FILL(kind, dest_data, ' ', j, incr);
11470 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011472 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011474 line_pos++;
11475 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011476 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011477 if (ch == '\n' || ch == '\r')
11478 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011480 }
11481 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011482 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011483
Antoine Pitroue71d5742011-10-04 15:55:09 +020011484 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011485 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487}
11488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011489PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491\n\
11492Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011493such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494arguments start and end are interpreted as in slice notation.\n\
11495\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011496Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497
11498static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011501 /* initialize variables to prevent gcc warning */
11502 PyObject *substring = NULL;
11503 Py_ssize_t start = 0;
11504 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011505 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
Jesus Ceaac451502011-04-20 17:09:23 +020011507 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11508 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
Christian Heimesd47802e2013-06-29 21:33:36 +020011511 if (PyUnicode_READY(self) == -1) {
11512 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011514 }
11515 if (PyUnicode_READY(substring) == -1) {
11516 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011518 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519
Victor Stinner7931d9a2011-11-04 00:22:48 +010011520 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521
11522 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 if (result == -2)
11525 return NULL;
11526
Christian Heimes217cfd12007-12-02 14:31:20 +000011527 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528}
11529
11530static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011531unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011533 void *data;
11534 enum PyUnicode_Kind kind;
11535 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011536
11537 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11538 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011540 }
11541 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11542 PyErr_SetString(PyExc_IndexError, "string index out of range");
11543 return NULL;
11544 }
11545 kind = PyUnicode_KIND(self);
11546 data = PyUnicode_DATA(self);
11547 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011548 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549}
11550
Guido van Rossumc2504932007-09-18 19:42:40 +000011551/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011552 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011553static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011554unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555{
Guido van Rossumc2504932007-09-18 19:42:40 +000011556 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011557 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011558
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011559#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011560 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011561#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 if (_PyUnicode_HASH(self) != -1)
11563 return _PyUnicode_HASH(self);
11564 if (PyUnicode_READY(self) == -1)
11565 return -1;
11566 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011567 /*
11568 We make the hash of the empty string be 0, rather than using
11569 (prefix ^ suffix), since this slightly obfuscates the hash secret
11570 */
11571 if (len == 0) {
11572 _PyUnicode_HASH(self) = 0;
11573 return 0;
11574 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011575 x = _Py_HashBytes(PyUnicode_DATA(self),
11576 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011578 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579}
11580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011581PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011584Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
11586static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011589 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011590 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011591 PyObject *substring = NULL;
11592 Py_ssize_t start = 0;
11593 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594
Jesus Ceaac451502011-04-20 17:09:23 +020011595 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11596 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598
Christian Heimesd47a0452013-06-29 21:21:37 +020011599 if (PyUnicode_READY(self) == -1) {
11600 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011602 }
11603 if (PyUnicode_READY(substring) == -1) {
11604 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607
Victor Stinner7931d9a2011-11-04 00:22:48 +010011608 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609
11610 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612 if (result == -2)
11613 return NULL;
11614
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615 if (result < 0) {
11616 PyErr_SetString(PyExc_ValueError, "substring not found");
11617 return NULL;
11618 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011619
Christian Heimes217cfd12007-12-02 14:31:20 +000011620 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621}
11622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011626Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011627at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628
11629static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011630unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 Py_ssize_t i, length;
11633 int kind;
11634 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635 int cased;
11636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 if (PyUnicode_READY(self) == -1)
11638 return NULL;
11639 length = PyUnicode_GET_LENGTH(self);
11640 kind = PyUnicode_KIND(self);
11641 data = PyUnicode_DATA(self);
11642
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 if (length == 1)
11645 return PyBool_FromLong(
11646 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011648 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011650 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011651
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653 for (i = 0; i < length; i++) {
11654 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011655
Benjamin Peterson29060642009-01-31 22:14:21 +000011656 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11657 return PyBool_FromLong(0);
11658 else if (!cased && Py_UNICODE_ISLOWER(ch))
11659 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011661 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662}
11663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011664PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011665 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011667Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011668at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669
11670static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011671unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 Py_ssize_t i, length;
11674 int kind;
11675 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676 int cased;
11677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 if (PyUnicode_READY(self) == -1)
11679 return NULL;
11680 length = PyUnicode_GET_LENGTH(self);
11681 kind = PyUnicode_KIND(self);
11682 data = PyUnicode_DATA(self);
11683
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 if (length == 1)
11686 return PyBool_FromLong(
11687 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011689 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011691 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011692
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 for (i = 0; i < length; i++) {
11695 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011696
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11698 return PyBool_FromLong(0);
11699 else if (!cased && Py_UNICODE_ISUPPER(ch))
11700 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011702 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703}
11704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011705PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011708Return True if S is a titlecased string and there is at least one\n\
11709character in S, i.e. upper- and titlecase characters may only\n\
11710follow uncased characters and lowercase characters only cased ones.\n\
11711Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712
11713static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011714unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 Py_ssize_t i, length;
11717 int kind;
11718 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719 int cased, previous_is_cased;
11720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 if (PyUnicode_READY(self) == -1)
11722 return NULL;
11723 length = PyUnicode_GET_LENGTH(self);
11724 kind = PyUnicode_KIND(self);
11725 data = PyUnicode_DATA(self);
11726
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 if (length == 1) {
11729 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11730 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11731 (Py_UNICODE_ISUPPER(ch) != 0));
11732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011734 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011736 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011737
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738 cased = 0;
11739 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 for (i = 0; i < length; i++) {
11741 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011742
Benjamin Peterson29060642009-01-31 22:14:21 +000011743 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11744 if (previous_is_cased)
11745 return PyBool_FromLong(0);
11746 previous_is_cased = 1;
11747 cased = 1;
11748 }
11749 else if (Py_UNICODE_ISLOWER(ch)) {
11750 if (!previous_is_cased)
11751 return PyBool_FromLong(0);
11752 previous_is_cased = 1;
11753 cased = 1;
11754 }
11755 else
11756 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011758 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759}
11760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011761PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011762 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011764Return True if all characters in S are whitespace\n\
11765and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766
11767static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011768unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 Py_ssize_t i, length;
11771 int kind;
11772 void *data;
11773
11774 if (PyUnicode_READY(self) == -1)
11775 return NULL;
11776 length = PyUnicode_GET_LENGTH(self);
11777 kind = PyUnicode_KIND(self);
11778 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (length == 1)
11782 return PyBool_FromLong(
11783 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011785 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 for (i = 0; i < length; i++) {
11790 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011791 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011794 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795}
11796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011797PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011799\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011800Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011801and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011802
11803static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011804unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 Py_ssize_t i, length;
11807 int kind;
11808 void *data;
11809
11810 if (PyUnicode_READY(self) == -1)
11811 return NULL;
11812 length = PyUnicode_GET_LENGTH(self);
11813 kind = PyUnicode_KIND(self);
11814 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011815
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011816 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 if (length == 1)
11818 return PyBool_FromLong(
11819 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011820
11821 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011823 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 for (i = 0; i < length; i++) {
11826 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011827 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011828 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011829 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011830}
11831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011832PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011834\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011835Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011836and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011837
11838static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011839unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011840{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 int kind;
11842 void *data;
11843 Py_ssize_t len, i;
11844
11845 if (PyUnicode_READY(self) == -1)
11846 return NULL;
11847
11848 kind = PyUnicode_KIND(self);
11849 data = PyUnicode_DATA(self);
11850 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011851
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011852 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 if (len == 1) {
11854 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11855 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11856 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011857
11858 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011860 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 for (i = 0; i < len; i++) {
11863 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011864 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011866 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011867 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011868}
11869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011870PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011871 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011873Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011874False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875
11876static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011877unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 Py_ssize_t i, length;
11880 int kind;
11881 void *data;
11882
11883 if (PyUnicode_READY(self) == -1)
11884 return NULL;
11885 length = PyUnicode_GET_LENGTH(self);
11886 kind = PyUnicode_KIND(self);
11887 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 if (length == 1)
11891 return PyBool_FromLong(
11892 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011894 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011896 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 for (i = 0; i < length; i++) {
11899 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011902 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903}
11904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011905PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011908Return True if all characters in S are digits\n\
11909and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910
11911static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011912unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 Py_ssize_t i, length;
11915 int kind;
11916 void *data;
11917
11918 if (PyUnicode_READY(self) == -1)
11919 return NULL;
11920 length = PyUnicode_GET_LENGTH(self);
11921 kind = PyUnicode_KIND(self);
11922 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 if (length == 1) {
11926 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11927 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11928 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011930 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011932 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 for (i = 0; i < length; i++) {
11935 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011938 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939}
11940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011941PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011944Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011945False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946
11947static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011948unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 Py_ssize_t i, length;
11951 int kind;
11952 void *data;
11953
11954 if (PyUnicode_READY(self) == -1)
11955 return NULL;
11956 length = PyUnicode_GET_LENGTH(self);
11957 kind = PyUnicode_KIND(self);
11958 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 if (length == 1)
11962 return PyBool_FromLong(
11963 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011965 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011967 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 for (i = 0; i < length; i++) {
11970 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011973 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974}
11975
Martin v. Löwis47383402007-08-15 07:32:56 +000011976int
11977PyUnicode_IsIdentifier(PyObject *self)
11978{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 int kind;
11980 void *data;
11981 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011982 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 if (PyUnicode_READY(self) == -1) {
11985 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 }
11988
11989 /* Special case for empty strings */
11990 if (PyUnicode_GET_LENGTH(self) == 0)
11991 return 0;
11992 kind = PyUnicode_KIND(self);
11993 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011994
11995 /* PEP 3131 says that the first character must be in
11996 XID_Start and subsequent characters in XID_Continue,
11997 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011998 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011999 letters, digits, underscore). However, given the current
12000 definition of XID_Start and XID_Continue, it is sufficient
12001 to check just for these, except that _ must be allowed
12002 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012004 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012005 return 0;
12006
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012007 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012010 return 1;
12011}
12012
12013PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012014 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012015\n\
12016Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012017to the language definition.\n\
12018\n\
12019Use keyword.iskeyword() to test for reserved identifiers\n\
12020such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012021
12022static PyObject*
12023unicode_isidentifier(PyObject *self)
12024{
12025 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12026}
12027
Georg Brandl559e5d72008-06-11 18:37:52 +000012028PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012029 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012030\n\
12031Return True if all characters in S are considered\n\
12032printable in repr() or S is empty, False otherwise.");
12033
12034static PyObject*
12035unicode_isprintable(PyObject *self)
12036{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 Py_ssize_t i, length;
12038 int kind;
12039 void *data;
12040
12041 if (PyUnicode_READY(self) == -1)
12042 return NULL;
12043 length = PyUnicode_GET_LENGTH(self);
12044 kind = PyUnicode_KIND(self);
12045 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012046
12047 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 if (length == 1)
12049 return PyBool_FromLong(
12050 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 for (i = 0; i < length; i++) {
12053 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012054 Py_RETURN_FALSE;
12055 }
12056 }
12057 Py_RETURN_TRUE;
12058}
12059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012060PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012061 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062\n\
12063Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012064iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065
12066static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012067unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012069 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070}
12071
Martin v. Löwis18e16552006-02-15 17:27:45 +000012072static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012073unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 if (PyUnicode_READY(self) == -1)
12076 return -1;
12077 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078}
12079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012080PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012081 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012083Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012084done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085
12086static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012087unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012089 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 Py_UCS4 fillchar = ' ';
12091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012092 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093 return NULL;
12094
Benjamin Petersonbac79492012-01-14 13:34:47 -050012095 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012096 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097
Victor Stinnerc4b49542011-12-11 22:44:26 +010012098 if (PyUnicode_GET_LENGTH(self) >= width)
12099 return unicode_result_unchanged(self);
12100
12101 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102}
12103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012104PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012105 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012107Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108
12109static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012110unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012112 if (PyUnicode_READY(self) == -1)
12113 return NULL;
12114 if (PyUnicode_IS_ASCII(self))
12115 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012116 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117}
12118
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string without its leading
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
12127
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012128/* externally visible for str.strip(unicode) */
12129PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012130_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 void *data;
12133 int kind;
12134 Py_ssize_t i, j, len;
12135 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012136 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12139 return NULL;
12140
12141 kind = PyUnicode_KIND(self);
12142 data = PyUnicode_DATA(self);
12143 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012144 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12146 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012147 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012148
Benjamin Peterson14339b62009-01-31 16:36:08 +000012149 i = 0;
12150 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012151 while (i < len) {
12152 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12153 if (!BLOOM(sepmask, ch))
12154 break;
12155 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12156 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012157 i++;
12158 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012159 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012160
Benjamin Peterson14339b62009-01-31 16:36:08 +000012161 j = len;
12162 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012163 j--;
12164 while (j >= i) {
12165 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12166 if (!BLOOM(sepmask, ch))
12167 break;
12168 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12169 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012170 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012171 }
12172
Benjamin Peterson29060642009-01-31 22:14:21 +000012173 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012174 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012175
Victor Stinner7931d9a2011-11-04 00:22:48 +010012176 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177}
12178
12179PyObject*
12180PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12181{
12182 unsigned char *data;
12183 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012184 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185
Victor Stinnerde636f32011-10-01 03:55:54 +020012186 if (PyUnicode_READY(self) == -1)
12187 return NULL;
12188
Victor Stinner684d5fd2012-05-03 02:32:34 +020012189 length = PyUnicode_GET_LENGTH(self);
12190 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012191
Victor Stinner684d5fd2012-05-03 02:32:34 +020012192 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012193 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194
Victor Stinnerde636f32011-10-01 03:55:54 +020012195 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012196 PyErr_SetString(PyExc_IndexError, "string index out of range");
12197 return NULL;
12198 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012199 if (start >= length || end < start)
12200 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012201
Victor Stinner684d5fd2012-05-03 02:32:34 +020012202 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012203 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012204 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012205 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012206 }
12207 else {
12208 kind = PyUnicode_KIND(self);
12209 data = PyUnicode_1BYTE_DATA(self);
12210 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012211 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012212 length);
12213 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215
12216static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012217do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 Py_ssize_t len, i, j;
12220
12221 if (PyUnicode_READY(self) == -1)
12222 return NULL;
12223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012225
Victor Stinnercc7af722013-04-09 22:39:24 +020012226 if (PyUnicode_IS_ASCII(self)) {
12227 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12228
12229 i = 0;
12230 if (striptype != RIGHTSTRIP) {
12231 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012232 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012233 if (!_Py_ascii_whitespace[ch])
12234 break;
12235 i++;
12236 }
12237 }
12238
12239 j = len;
12240 if (striptype != LEFTSTRIP) {
12241 j--;
12242 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012243 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012244 if (!_Py_ascii_whitespace[ch])
12245 break;
12246 j--;
12247 }
12248 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012249 }
12250 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012251 else {
12252 int kind = PyUnicode_KIND(self);
12253 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012254
Victor Stinnercc7af722013-04-09 22:39:24 +020012255 i = 0;
12256 if (striptype != RIGHTSTRIP) {
12257 while (i < len) {
12258 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12259 if (!Py_UNICODE_ISSPACE(ch))
12260 break;
12261 i++;
12262 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012263 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012264
12265 j = len;
12266 if (striptype != LEFTSTRIP) {
12267 j--;
12268 while (j >= i) {
12269 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12270 if (!Py_UNICODE_ISSPACE(ch))
12271 break;
12272 j--;
12273 }
12274 j++;
12275 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012276 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012277
Victor Stinner7931d9a2011-11-04 00:22:48 +010012278 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279}
12280
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012281
12282static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012283do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012284{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012285 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012286
Serhiy Storchakac6792272013-10-19 21:03:34 +030012287 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012288 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012289
Benjamin Peterson14339b62009-01-31 16:36:08 +000012290 if (sep != NULL && sep != Py_None) {
12291 if (PyUnicode_Check(sep))
12292 return _PyUnicode_XStrip(self, striptype, sep);
12293 else {
12294 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012295 "%s arg must be None or str",
12296 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012297 return NULL;
12298 }
12299 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012300
Benjamin Peterson14339b62009-01-31 16:36:08 +000012301 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012302}
12303
12304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012305PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012306 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307\n\
12308Return a copy of the string S with leading and trailing\n\
12309whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012310If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012311
12312static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012313unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012314{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012315 if (PyTuple_GET_SIZE(args) == 0)
12316 return do_strip(self, BOTHSTRIP); /* Common case */
12317 else
12318 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012319}
12320
12321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012322PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012324\n\
12325Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012326If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012327
12328static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012329unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012330{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012331 if (PyTuple_GET_SIZE(args) == 0)
12332 return do_strip(self, LEFTSTRIP); /* Common case */
12333 else
12334 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012335}
12336
12337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012338PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012339 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012340\n\
12341Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012342If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012343
12344static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012345unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012346{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012347 if (PyTuple_GET_SIZE(args) == 0)
12348 return do_strip(self, RIGHTSTRIP); /* Common case */
12349 else
12350 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012351}
12352
12353
Guido van Rossumd57fd912000-03-10 22:53:23 +000012354static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012355unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012357 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359
Serhiy Storchaka05997252013-01-26 12:14:02 +020012360 if (len < 1)
12361 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362
Victor Stinnerc4b49542011-12-11 22:44:26 +010012363 /* no repeat, return original string */
12364 if (len == 1)
12365 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012366
Benjamin Petersonbac79492012-01-14 13:34:47 -050012367 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 return NULL;
12369
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012370 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012371 PyErr_SetString(PyExc_OverflowError,
12372 "repeated string is too long");
12373 return NULL;
12374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012376
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012377 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378 if (!u)
12379 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012380 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 if (PyUnicode_GET_LENGTH(str) == 1) {
12383 const int kind = PyUnicode_KIND(str);
12384 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012385 if (kind == PyUnicode_1BYTE_KIND) {
12386 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012387 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012388 }
12389 else if (kind == PyUnicode_2BYTE_KIND) {
12390 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012391 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012392 ucs2[n] = fill_char;
12393 } else {
12394 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12395 assert(kind == PyUnicode_4BYTE_KIND);
12396 for (n = 0; n < len; ++n)
12397 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 }
12400 else {
12401 /* number of characters copied this far */
12402 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012403 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 char *to = (char *) PyUnicode_DATA(u);
12405 Py_MEMCPY(to, PyUnicode_DATA(str),
12406 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012407 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 n = (done <= nchars-done) ? done : nchars-done;
12409 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012410 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412 }
12413
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012414 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012415 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416}
12417
Alexander Belopolsky40018472011-02-26 01:02:56 +000012418PyObject *
12419PyUnicode_Replace(PyObject *obj,
12420 PyObject *subobj,
12421 PyObject *replobj,
12422 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012423{
12424 PyObject *self;
12425 PyObject *str1;
12426 PyObject *str2;
12427 PyObject *result;
12428
12429 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012430 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012433 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012434 Py_DECREF(self);
12435 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436 }
12437 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012438 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012439 Py_DECREF(self);
12440 Py_DECREF(str1);
12441 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012443 if (PyUnicode_READY(self) == -1 ||
12444 PyUnicode_READY(str1) == -1 ||
12445 PyUnicode_READY(str2) == -1)
12446 result = NULL;
12447 else
12448 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449 Py_DECREF(self);
12450 Py_DECREF(str1);
12451 Py_DECREF(str2);
12452 return result;
12453}
12454
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012455PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012456 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457\n\
12458Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012459old replaced by new. If the optional argument count is\n\
12460given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461
12462static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012464{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 PyObject *str1;
12466 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012467 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468 PyObject *result;
12469
Martin v. Löwis18e16552006-02-15 17:27:45 +000012470 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012472 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012473 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012475 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 return NULL;
12477 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012478 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012479 Py_DECREF(str1);
12480 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012481 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012482 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12483 result = NULL;
12484 else
12485 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486
12487 Py_DECREF(str1);
12488 Py_DECREF(str2);
12489 return result;
12490}
12491
Alexander Belopolsky40018472011-02-26 01:02:56 +000012492static PyObject *
12493unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012495 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 Py_ssize_t isize;
12497 Py_ssize_t osize, squote, dquote, i, o;
12498 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012499 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012503 return NULL;
12504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 isize = PyUnicode_GET_LENGTH(unicode);
12506 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012508 /* Compute length of output, quote characters, and
12509 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012510 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 max = 127;
12512 squote = dquote = 0;
12513 ikind = PyUnicode_KIND(unicode);
12514 for (i = 0; i < isize; i++) {
12515 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012516 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012518 case '\'': squote++; break;
12519 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012521 incr = 2;
12522 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523 default:
12524 /* Fast-path ASCII */
12525 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012526 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012528 ;
12529 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012530 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012532 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012534 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012536 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012538 if (osize > PY_SSIZE_T_MAX - incr) {
12539 PyErr_SetString(PyExc_OverflowError,
12540 "string is too long to generate repr");
12541 return NULL;
12542 }
12543 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 }
12545
12546 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012547 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012549 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 if (dquote)
12551 /* Both squote and dquote present. Use squote,
12552 and escape them */
12553 osize += squote;
12554 else
12555 quote = '"';
12556 }
Victor Stinner55c08782013-04-14 18:45:39 +020012557 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558
12559 repr = PyUnicode_New(osize, max);
12560 if (repr == NULL)
12561 return NULL;
12562 okind = PyUnicode_KIND(repr);
12563 odata = PyUnicode_DATA(repr);
12564
12565 PyUnicode_WRITE(okind, odata, 0, quote);
12566 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012567 if (unchanged) {
12568 _PyUnicode_FastCopyCharacters(repr, 1,
12569 unicode, 0,
12570 isize);
12571 }
12572 else {
12573 for (i = 0, o = 1; i < isize; i++) {
12574 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575
Victor Stinner55c08782013-04-14 18:45:39 +020012576 /* Escape quotes and backslashes */
12577 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012578 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012580 continue;
12581 }
12582
12583 /* Map special whitespace to '\t', \n', '\r' */
12584 if (ch == '\t') {
12585 PyUnicode_WRITE(okind, odata, o++, '\\');
12586 PyUnicode_WRITE(okind, odata, o++, 't');
12587 }
12588 else if (ch == '\n') {
12589 PyUnicode_WRITE(okind, odata, o++, '\\');
12590 PyUnicode_WRITE(okind, odata, o++, 'n');
12591 }
12592 else if (ch == '\r') {
12593 PyUnicode_WRITE(okind, odata, o++, '\\');
12594 PyUnicode_WRITE(okind, odata, o++, 'r');
12595 }
12596
12597 /* Map non-printable US ASCII to '\xhh' */
12598 else if (ch < ' ' || ch == 0x7F) {
12599 PyUnicode_WRITE(okind, odata, o++, '\\');
12600 PyUnicode_WRITE(okind, odata, o++, 'x');
12601 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12602 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12603 }
12604
12605 /* Copy ASCII characters as-is */
12606 else if (ch < 0x7F) {
12607 PyUnicode_WRITE(okind, odata, o++, ch);
12608 }
12609
12610 /* Non-ASCII characters */
12611 else {
12612 /* Map Unicode whitespace and control characters
12613 (categories Z* and C* except ASCII space)
12614 */
12615 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12616 PyUnicode_WRITE(okind, odata, o++, '\\');
12617 /* Map 8-bit characters to '\xhh' */
12618 if (ch <= 0xff) {
12619 PyUnicode_WRITE(okind, odata, o++, 'x');
12620 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12621 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12622 }
12623 /* Map 16-bit characters to '\uxxxx' */
12624 else if (ch <= 0xffff) {
12625 PyUnicode_WRITE(okind, odata, o++, 'u');
12626 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12627 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12628 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12629 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12630 }
12631 /* Map 21-bit characters to '\U00xxxxxx' */
12632 else {
12633 PyUnicode_WRITE(okind, odata, o++, 'U');
12634 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12635 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12636 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12637 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12638 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12639 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12640 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12641 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12642 }
12643 }
12644 /* Copy characters as-is */
12645 else {
12646 PyUnicode_WRITE(okind, odata, o++, ch);
12647 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012648 }
12649 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012650 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012652 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012653 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654}
12655
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012656PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012657 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658\n\
12659Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012660such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661arguments start and end are interpreted as in slice notation.\n\
12662\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012663Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664
12665static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012668 /* initialize variables to prevent gcc warning */
12669 PyObject *substring = NULL;
12670 Py_ssize_t start = 0;
12671 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012672 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673
Jesus Ceaac451502011-04-20 17:09:23 +020012674 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12675 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677
Christian Heimesea71a522013-06-29 21:17:34 +020012678 if (PyUnicode_READY(self) == -1) {
12679 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012681 }
12682 if (PyUnicode_READY(substring) == -1) {
12683 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686
Victor Stinner7931d9a2011-11-04 00:22:48 +010012687 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688
12689 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012690
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 if (result == -2)
12692 return NULL;
12693
Christian Heimes217cfd12007-12-02 14:31:20 +000012694 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695}
12696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012697PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012698 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012700Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701
12702static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012705 /* initialize variables to prevent gcc warning */
12706 PyObject *substring = NULL;
12707 Py_ssize_t start = 0;
12708 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012709 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012710
Jesus Ceaac451502011-04-20 17:09:23 +020012711 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12712 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012713 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714
Christian Heimesea71a522013-06-29 21:17:34 +020012715 if (PyUnicode_READY(self) == -1) {
12716 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012718 }
12719 if (PyUnicode_READY(substring) == -1) {
12720 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012722 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723
Victor Stinner7931d9a2011-11-04 00:22:48 +010012724 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725
12726 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 if (result == -2)
12729 return NULL;
12730
Guido van Rossumd57fd912000-03-10 22:53:23 +000012731 if (result < 0) {
12732 PyErr_SetString(PyExc_ValueError, "substring not found");
12733 return NULL;
12734 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735
Christian Heimes217cfd12007-12-02 14:31:20 +000012736 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737}
12738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012739PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012742Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012743done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744
12745static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012746unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012748 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 Py_UCS4 fillchar = ' ';
12750
Victor Stinnere9a29352011-10-01 02:14:59 +020012751 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012753
Benjamin Petersonbac79492012-01-14 13:34:47 -050012754 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012755 return NULL;
12756
Victor Stinnerc4b49542011-12-11 22:44:26 +010012757 if (PyUnicode_GET_LENGTH(self) >= width)
12758 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759
Victor Stinnerc4b49542011-12-11 22:44:26 +010012760 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761}
12762
Alexander Belopolsky40018472011-02-26 01:02:56 +000012763PyObject *
12764PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765{
12766 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012767
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768 s = PyUnicode_FromObject(s);
12769 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012770 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012771 if (sep != NULL) {
12772 sep = PyUnicode_FromObject(sep);
12773 if (sep == NULL) {
12774 Py_DECREF(s);
12775 return NULL;
12776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012777 }
12778
Victor Stinner9310abb2011-10-05 00:59:23 +020012779 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780
12781 Py_DECREF(s);
12782 Py_XDECREF(sep);
12783 return result;
12784}
12785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012786PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012787 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012788\n\
12789Return a list of the words in S, using sep as the\n\
12790delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012791splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012792whitespace string is a separator and empty strings are\n\
12793removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794
12795static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012796unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012798 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012800 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012802 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12803 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804 return NULL;
12805
12806 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012807 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012809 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012811 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812}
12813
Thomas Wouters477c8d52006-05-27 19:21:47 +000012814PyObject *
12815PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12816{
12817 PyObject* str_obj;
12818 PyObject* sep_obj;
12819 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012820 int kind1, kind2;
12821 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012823
12824 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012825 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012826 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012827 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012828 if (!sep_obj) {
12829 Py_DECREF(str_obj);
12830 return NULL;
12831 }
12832 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12833 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012834 Py_DECREF(str_obj);
12835 return NULL;
12836 }
12837
Victor Stinner14f8f022011-10-05 20:58:25 +020012838 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012839 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012840 len1 = PyUnicode_GET_LENGTH(str_obj);
12841 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012842 if (kind1 < kind2 || len1 < len2) {
12843 _Py_INCREF_UNICODE_EMPTY();
12844 if (!unicode_empty)
12845 out = NULL;
12846 else {
12847 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12848 Py_DECREF(unicode_empty);
12849 }
12850 Py_DECREF(sep_obj);
12851 Py_DECREF(str_obj);
12852 return out;
12853 }
12854 buf1 = PyUnicode_DATA(str_obj);
12855 buf2 = PyUnicode_DATA(sep_obj);
12856 if (kind2 != kind1) {
12857 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12858 if (!buf2)
12859 goto onError;
12860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012862 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012864 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12865 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12866 else
12867 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012868 break;
12869 case PyUnicode_2BYTE_KIND:
12870 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12871 break;
12872 case PyUnicode_4BYTE_KIND:
12873 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12874 break;
12875 default:
12876 assert(0);
12877 out = 0;
12878 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012879
12880 Py_DECREF(sep_obj);
12881 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012882 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012883 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012884
12885 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012886 onError:
12887 Py_DECREF(sep_obj);
12888 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012889 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012890 PyMem_Free(buf2);
12891 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012892}
12893
12894
12895PyObject *
12896PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12897{
12898 PyObject* str_obj;
12899 PyObject* sep_obj;
12900 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012901 int kind1, kind2;
12902 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012903 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012904
12905 str_obj = PyUnicode_FromObject(str_in);
12906 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012907 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012908 sep_obj = PyUnicode_FromObject(sep_in);
12909 if (!sep_obj) {
12910 Py_DECREF(str_obj);
12911 return NULL;
12912 }
12913
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012914 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012915 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916 len1 = PyUnicode_GET_LENGTH(str_obj);
12917 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012918 if (kind1 < kind2 || len1 < len2) {
12919 _Py_INCREF_UNICODE_EMPTY();
12920 if (!unicode_empty)
12921 out = NULL;
12922 else {
12923 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12924 Py_DECREF(unicode_empty);
12925 }
12926 Py_DECREF(sep_obj);
12927 Py_DECREF(str_obj);
12928 return out;
12929 }
12930 buf1 = PyUnicode_DATA(str_obj);
12931 buf2 = PyUnicode_DATA(sep_obj);
12932 if (kind2 != kind1) {
12933 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12934 if (!buf2)
12935 goto onError;
12936 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012938 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012940 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12941 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12942 else
12943 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012944 break;
12945 case PyUnicode_2BYTE_KIND:
12946 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12947 break;
12948 case PyUnicode_4BYTE_KIND:
12949 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12950 break;
12951 default:
12952 assert(0);
12953 out = 0;
12954 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012955
12956 Py_DECREF(sep_obj);
12957 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012958 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012960
12961 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962 onError:
12963 Py_DECREF(sep_obj);
12964 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012965 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966 PyMem_Free(buf2);
12967 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012968}
12969
12970PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012971 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012972\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012973Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012974the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012975found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012976
12977static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012978unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012979{
Victor Stinner9310abb2011-10-05 00:59:23 +020012980 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012981}
12982
12983PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012984 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012985\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012986Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012987the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012988separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012989
12990static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012991unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012992{
Victor Stinner9310abb2011-10-05 00:59:23 +020012993 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012994}
12995
Alexander Belopolsky40018472011-02-26 01:02:56 +000012996PyObject *
12997PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012998{
12999 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013000
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013001 s = PyUnicode_FromObject(s);
13002 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013003 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013004 if (sep != NULL) {
13005 sep = PyUnicode_FromObject(sep);
13006 if (sep == NULL) {
13007 Py_DECREF(s);
13008 return NULL;
13009 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013010 }
13011
Victor Stinner9310abb2011-10-05 00:59:23 +020013012 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013013
13014 Py_DECREF(s);
13015 Py_XDECREF(sep);
13016 return result;
13017}
13018
13019PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013020 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013021\n\
13022Return a list of the words in S, using sep as the\n\
13023delimiter string, starting at the end of the string and\n\
13024working to the front. If maxsplit is given, at most maxsplit\n\
13025splits are done. If sep is not specified, any whitespace string\n\
13026is a separator.");
13027
13028static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013029unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013030{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013031 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013032 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013033 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013034
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013035 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
13036 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013037 return NULL;
13038
13039 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000013040 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013041 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020013042 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013043 else
Victor Stinner9310abb2011-10-05 00:59:23 +020013044 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013045}
13046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013047PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013048 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013049\n\
13050Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000013051Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013052is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053
13054static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013055unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013056{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013057 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000013058 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013059
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013060 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
13061 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062 return NULL;
13063
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013064 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013065}
13066
13067static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013068PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013070 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071}
13072
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013073PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013074 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075\n\
13076Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013077and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078
13079static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013080unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013082 if (PyUnicode_READY(self) == -1)
13083 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013084 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085}
13086
Larry Hastings61272b72014-01-07 12:41:53 -080013087/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013088
Larry Hastings31826802013-10-19 00:09:25 -070013089@staticmethod
13090str.maketrans as unicode_maketrans
13091
13092 x: object
13093
13094 y: unicode=NULL
13095
13096 z: unicode=NULL
13097
13098 /
13099
13100Return a translation table usable for str.translate().
13101
13102If there is only one argument, it must be a dictionary mapping Unicode
13103ordinals (integers) or characters to Unicode ordinals, strings or None.
13104Character keys will be then converted to ordinals.
13105If there are two arguments, they must be strings of equal length, and
13106in the resulting dictionary, each character in x will be mapped to the
13107character at the same position in y. If there is a third argument, it
13108must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013109[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013110
Larry Hastings31826802013-10-19 00:09:25 -070013111static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013112unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013113/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013114{
Georg Brandlceee0772007-11-27 23:48:05 +000013115 PyObject *new = NULL, *key, *value;
13116 Py_ssize_t i = 0;
13117 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013118
Georg Brandlceee0772007-11-27 23:48:05 +000013119 new = PyDict_New();
13120 if (!new)
13121 return NULL;
13122 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 int x_kind, y_kind, z_kind;
13124 void *x_data, *y_data, *z_data;
13125
Georg Brandlceee0772007-11-27 23:48:05 +000013126 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013127 if (!PyUnicode_Check(x)) {
13128 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13129 "be a string if there is a second argument");
13130 goto err;
13131 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013133 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13134 "arguments must have equal length");
13135 goto err;
13136 }
13137 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 x_kind = PyUnicode_KIND(x);
13139 y_kind = PyUnicode_KIND(y);
13140 x_data = PyUnicode_DATA(x);
13141 y_data = PyUnicode_DATA(y);
13142 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13143 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013144 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013145 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013146 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013147 if (!value) {
13148 Py_DECREF(key);
13149 goto err;
13150 }
Georg Brandlceee0772007-11-27 23:48:05 +000013151 res = PyDict_SetItem(new, key, value);
13152 Py_DECREF(key);
13153 Py_DECREF(value);
13154 if (res < 0)
13155 goto err;
13156 }
13157 /* create entries for deleting chars in z */
13158 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013159 z_kind = PyUnicode_KIND(z);
13160 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013161 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013162 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013163 if (!key)
13164 goto err;
13165 res = PyDict_SetItem(new, key, Py_None);
13166 Py_DECREF(key);
13167 if (res < 0)
13168 goto err;
13169 }
13170 }
13171 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013172 int kind;
13173 void *data;
13174
Georg Brandlceee0772007-11-27 23:48:05 +000013175 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013176 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013177 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13178 "to maketrans it must be a dict");
13179 goto err;
13180 }
13181 /* copy entries into the new dict, converting string keys to int keys */
13182 while (PyDict_Next(x, &i, &key, &value)) {
13183 if (PyUnicode_Check(key)) {
13184 /* convert string keys to integer keys */
13185 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013186 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013187 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13188 "table must be of length 1");
13189 goto err;
13190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191 kind = PyUnicode_KIND(key);
13192 data = PyUnicode_DATA(key);
13193 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013194 if (!newkey)
13195 goto err;
13196 res = PyDict_SetItem(new, newkey, value);
13197 Py_DECREF(newkey);
13198 if (res < 0)
13199 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013200 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013201 /* just keep integer keys */
13202 if (PyDict_SetItem(new, key, value) < 0)
13203 goto err;
13204 } else {
13205 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13206 "be strings or integers");
13207 goto err;
13208 }
13209 }
13210 }
13211 return new;
13212 err:
13213 Py_DECREF(new);
13214 return NULL;
13215}
13216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013217PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013218 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013220Return a copy of the string S in which each character has been mapped\n\
13221through the given translation table. The table must implement\n\
13222lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13223mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13224this operation raises LookupError, the character is left untouched.\n\
13225Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013226
13227static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013228unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013229{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013230 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231}
13232
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013233PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013234 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013236Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013237
13238static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013239unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013240{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013241 if (PyUnicode_READY(self) == -1)
13242 return NULL;
13243 if (PyUnicode_IS_ASCII(self))
13244 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013245 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013246}
13247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013248PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013249 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013251Pad a numeric string S with zeros on the left, to fill a field\n\
13252of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253
13254static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013255unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013257 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013258 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013259 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013260 int kind;
13261 void *data;
13262 Py_UCS4 chr;
13263
Martin v. Löwis18e16552006-02-15 17:27:45 +000013264 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013265 return NULL;
13266
Benjamin Petersonbac79492012-01-14 13:34:47 -050013267 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013269
Victor Stinnerc4b49542011-12-11 22:44:26 +010013270 if (PyUnicode_GET_LENGTH(self) >= width)
13271 return unicode_result_unchanged(self);
13272
13273 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274
13275 u = pad(self, fill, 0, '0');
13276
Walter Dörwald068325e2002-04-15 13:36:47 +000013277 if (u == NULL)
13278 return NULL;
13279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013280 kind = PyUnicode_KIND(u);
13281 data = PyUnicode_DATA(u);
13282 chr = PyUnicode_READ(kind, data, fill);
13283
13284 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013285 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286 PyUnicode_WRITE(kind, data, 0, chr);
13287 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013288 }
13289
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013290 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013291 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013293
#if 0
/* Compiled out.  Would forward `self` to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013302PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013303 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013304\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013305Return True if S starts with the specified prefix, False otherwise.\n\
13306With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013307With optional end, stop comparing S at that position.\n\
13308prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309
13310static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013311unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013312 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013314 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013315 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013316 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013317 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013318 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013319
Jesus Ceaac451502011-04-20 17:09:23 +020013320 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013321 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013322 if (PyTuple_Check(subobj)) {
13323 Py_ssize_t i;
13324 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013325 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013326 if (substring == NULL)
13327 return NULL;
13328 result = tailmatch(self, substring, start, end, -1);
13329 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013330 if (result == -1)
13331 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013332 if (result) {
13333 Py_RETURN_TRUE;
13334 }
13335 }
13336 /* nothing matched */
13337 Py_RETURN_FALSE;
13338 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013339 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013340 if (substring == NULL) {
13341 if (PyErr_ExceptionMatches(PyExc_TypeError))
13342 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13343 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013344 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013345 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013346 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013347 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013348 if (result == -1)
13349 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013350 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013351}
13352
13353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013354PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013355 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013357Return True if S ends with the specified suffix, False otherwise.\n\
13358With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013359With optional end, stop comparing S at that position.\n\
13360suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013361
13362static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013363unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013364 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013365{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013366 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013367 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013368 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013369 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013370 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371
Jesus Ceaac451502011-04-20 17:09:23 +020013372 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013373 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013374 if (PyTuple_Check(subobj)) {
13375 Py_ssize_t i;
13376 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013377 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013378 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013379 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013380 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013381 result = tailmatch(self, substring, start, end, +1);
13382 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013383 if (result == -1)
13384 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013385 if (result) {
13386 Py_RETURN_TRUE;
13387 }
13388 }
13389 Py_RETURN_FALSE;
13390 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013391 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013392 if (substring == NULL) {
13393 if (PyErr_ExceptionMatches(PyExc_TypeError))
13394 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13395 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013396 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013397 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013398 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013399 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013400 if (result == -1)
13401 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013402 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013403}
13404
Victor Stinner202fdca2012-05-07 12:47:02 +020013405Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013406_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013407{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013408 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13409 writer->data = PyUnicode_DATA(writer->buffer);
13410
13411 if (!writer->readonly) {
13412 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013413 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013414 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013415 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013416 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13417 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13418 writer->kind = PyUnicode_WCHAR_KIND;
13419 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13420
Victor Stinner8f674cc2013-04-17 23:02:17 +020013421 /* Copy-on-write mode: set buffer size to 0 so
13422 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13423 * next write. */
13424 writer->size = 0;
13425 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013426}
13427
Victor Stinnerd3f08822012-05-29 12:57:52 +020013428void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013429_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013430{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013431 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013432
13433 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013434 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013435
13436 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13437 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13438 writer->kind = PyUnicode_WCHAR_KIND;
13439 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013440}
13441
Victor Stinnerd3f08822012-05-29 12:57:52 +020013442int
13443_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13444 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013445{
13446 Py_ssize_t newlen;
13447 PyObject *newbuffer;
13448
Victor Stinnerca9381e2015-09-22 00:58:32 +020013449 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013450 assert((maxchar > writer->maxchar && length >= 0)
13451 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013452
Victor Stinner202fdca2012-05-07 12:47:02 +020013453 if (length > PY_SSIZE_T_MAX - writer->pos) {
13454 PyErr_NoMemory();
13455 return -1;
13456 }
13457 newlen = writer->pos + length;
13458
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013459 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013460
Victor Stinnerd3f08822012-05-29 12:57:52 +020013461 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013462 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013463 if (writer->overallocate
13464 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13465 /* overallocate to limit the number of realloc() */
13466 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013467 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013468 if (newlen < writer->min_length)
13469 newlen = writer->min_length;
13470
Victor Stinnerd3f08822012-05-29 12:57:52 +020013471 writer->buffer = PyUnicode_New(newlen, maxchar);
13472 if (writer->buffer == NULL)
13473 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013474 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013475 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013476 if (writer->overallocate
13477 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13478 /* overallocate to limit the number of realloc() */
13479 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013480 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013481 if (newlen < writer->min_length)
13482 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013483
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013484 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013485 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013486 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013487 newbuffer = PyUnicode_New(newlen, maxchar);
13488 if (newbuffer == NULL)
13489 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013490 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13491 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013492 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013493 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013494 }
13495 else {
13496 newbuffer = resize_compact(writer->buffer, newlen);
13497 if (newbuffer == NULL)
13498 return -1;
13499 }
13500 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013501 }
13502 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013503 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013504 newbuffer = PyUnicode_New(writer->size, maxchar);
13505 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013506 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013507 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13508 writer->buffer, 0, writer->pos);
13509 Py_DECREF(writer->buffer);
13510 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013511 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013512 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013513 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013514
13515#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013516}
13517
Victor Stinnerca9381e2015-09-22 00:58:32 +020013518int
13519_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13520 enum PyUnicode_Kind kind)
13521{
13522 Py_UCS4 maxchar;
13523
13524 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13525 assert(writer->kind < kind);
13526
13527 switch (kind)
13528 {
13529 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13530 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13531 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13532 default:
13533 assert(0 && "invalid kind");
13534 return -1;
13535 }
13536
13537 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13538}
13539
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013540Py_LOCAL_INLINE(int)
13541_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013542{
13543 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13544 return -1;
13545 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13546 writer->pos++;
13547 return 0;
13548}
13549
13550int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013551_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13552{
13553 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13554}
13555
13556int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013557_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13558{
13559 Py_UCS4 maxchar;
13560 Py_ssize_t len;
13561
13562 if (PyUnicode_READY(str) == -1)
13563 return -1;
13564 len = PyUnicode_GET_LENGTH(str);
13565 if (len == 0)
13566 return 0;
13567 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13568 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013569 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013570 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013571 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013572 Py_INCREF(str);
13573 writer->buffer = str;
13574 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013575 writer->pos += len;
13576 return 0;
13577 }
13578 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13579 return -1;
13580 }
13581 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13582 str, 0, len);
13583 writer->pos += len;
13584 return 0;
13585}
13586
Victor Stinnere215d962012-10-06 23:03:36 +020013587int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013588_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13589 Py_ssize_t start, Py_ssize_t end)
13590{
13591 Py_UCS4 maxchar;
13592 Py_ssize_t len;
13593
13594 if (PyUnicode_READY(str) == -1)
13595 return -1;
13596
13597 assert(0 <= start);
13598 assert(end <= PyUnicode_GET_LENGTH(str));
13599 assert(start <= end);
13600
13601 if (end == 0)
13602 return 0;
13603
13604 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13605 return _PyUnicodeWriter_WriteStr(writer, str);
13606
13607 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13608 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13609 else
13610 maxchar = writer->maxchar;
13611 len = end - start;
13612
13613 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13614 return -1;
13615
13616 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13617 str, start, len);
13618 writer->pos += len;
13619 return 0;
13620}
13621
13622int
Victor Stinner4a587072013-11-19 12:54:53 +010013623_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13624 const char *ascii, Py_ssize_t len)
13625{
13626 if (len == -1)
13627 len = strlen(ascii);
13628
13629 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13630
13631 if (writer->buffer == NULL && !writer->overallocate) {
13632 PyObject *str;
13633
13634 str = _PyUnicode_FromASCII(ascii, len);
13635 if (str == NULL)
13636 return -1;
13637
13638 writer->readonly = 1;
13639 writer->buffer = str;
13640 _PyUnicodeWriter_Update(writer);
13641 writer->pos += len;
13642 return 0;
13643 }
13644
13645 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13646 return -1;
13647
13648 switch (writer->kind)
13649 {
13650 case PyUnicode_1BYTE_KIND:
13651 {
13652 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13653 Py_UCS1 *data = writer->data;
13654
13655 Py_MEMCPY(data + writer->pos, str, len);
13656 break;
13657 }
13658 case PyUnicode_2BYTE_KIND:
13659 {
13660 _PyUnicode_CONVERT_BYTES(
13661 Py_UCS1, Py_UCS2,
13662 ascii, ascii + len,
13663 (Py_UCS2 *)writer->data + writer->pos);
13664 break;
13665 }
13666 case PyUnicode_4BYTE_KIND:
13667 {
13668 _PyUnicode_CONVERT_BYTES(
13669 Py_UCS1, Py_UCS4,
13670 ascii, ascii + len,
13671 (Py_UCS4 *)writer->data + writer->pos);
13672 break;
13673 }
13674 default:
13675 assert(0);
13676 }
13677
13678 writer->pos += len;
13679 return 0;
13680}
13681
13682int
13683_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13684 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013685{
13686 Py_UCS4 maxchar;
13687
13688 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13689 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13690 return -1;
13691 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13692 writer->pos += len;
13693 return 0;
13694}
13695
Victor Stinnerd3f08822012-05-29 12:57:52 +020013696PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013697_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013698{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013699 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013700 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013701 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013702 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013703 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013704 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013705 str = writer->buffer;
13706 writer->buffer = NULL;
13707 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13708 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013709 }
13710 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13711 PyObject *newbuffer;
13712 newbuffer = resize_compact(writer->buffer, writer->pos);
13713 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013714 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013715 return NULL;
13716 }
13717 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013718 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013719 str = writer->buffer;
13720 writer->buffer = NULL;
13721 assert(_PyUnicode_CheckConsistency(str, 1));
13722 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013723}
13724
Victor Stinnerd3f08822012-05-29 12:57:52 +020013725void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013726_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013727{
13728 Py_CLEAR(writer->buffer);
13729}
13730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013731#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013732
13733PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013734 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013735\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013736Return a formatted version of S, using substitutions from args and kwargs.\n\
13737The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013738
Eric Smith27bbca62010-11-04 17:06:58 +000013739PyDoc_STRVAR(format_map__doc__,
13740 "S.format_map(mapping) -> str\n\
13741\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013742Return a formatted version of S, using substitutions from mapping.\n\
13743The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013744
Eric Smith4a7d76d2008-05-30 18:10:19 +000013745static PyObject *
13746unicode__format__(PyObject* self, PyObject* args)
13747{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013748 PyObject *format_spec;
13749 _PyUnicodeWriter writer;
13750 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013751
13752 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13753 return NULL;
13754
Victor Stinnerd3f08822012-05-29 12:57:52 +020013755 if (PyUnicode_READY(self) == -1)
13756 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013757 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013758 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13759 self, format_spec, 0,
13760 PyUnicode_GET_LENGTH(format_spec));
13761 if (ret == -1) {
13762 _PyUnicodeWriter_Dealloc(&writer);
13763 return NULL;
13764 }
13765 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013766}
13767
Eric Smith8c663262007-08-25 02:26:07 +000013768PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013769 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013770\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013771Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013772
13773static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013774unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013775{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013776 Py_ssize_t size;
13777
13778 /* If it's a compact object, account for base structure +
13779 character data. */
13780 if (PyUnicode_IS_COMPACT_ASCII(v))
13781 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13782 else if (PyUnicode_IS_COMPACT(v))
13783 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013784 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013785 else {
13786 /* If it is a two-block object, account for base object, and
13787 for character block if present. */
13788 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013789 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013790 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013791 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013792 }
13793 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013794 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013795 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013796 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013797 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013798 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013799
13800 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013801}
13802
13803PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013804 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013805
13806static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013807unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013808{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013809 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013810 if (!copy)
13811 return NULL;
13812 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013813}
13814
Guido van Rossumd57fd912000-03-10 22:53:23 +000013815static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013816 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013817 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013818 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13819 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013820 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13821 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013822 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013823 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13824 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13825 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013826 {"expandtabs", (PyCFunction) unicode_expandtabs,
13827 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013828 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013829 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013830 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13831 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13832 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013833 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013834 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13835 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13836 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013837 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013838 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013839 {"splitlines", (PyCFunction) unicode_splitlines,
13840 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013841 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013842 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13843 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13844 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13845 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13846 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13847 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13848 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13849 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13850 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13851 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13852 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13853 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13854 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13855 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013856 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013857 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013858 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013859 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013860 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013861 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013862 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013863 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013864#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013865 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013866 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013867#endif
13868
Benjamin Peterson14339b62009-01-31 16:36:08 +000013869 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013870 {NULL, NULL}
13871};
13872
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013873static PyObject *
13874unicode_mod(PyObject *v, PyObject *w)
13875{
Brian Curtindfc80e32011-08-10 20:28:54 -050013876 if (!PyUnicode_Check(v))
13877 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013878 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013879}
13880
13881static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013882 0, /*nb_add*/
13883 0, /*nb_subtract*/
13884 0, /*nb_multiply*/
13885 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013886};
13887
Guido van Rossumd57fd912000-03-10 22:53:23 +000013888static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013889 (lenfunc) unicode_length, /* sq_length */
13890 PyUnicode_Concat, /* sq_concat */
13891 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13892 (ssizeargfunc) unicode_getitem, /* sq_item */
13893 0, /* sq_slice */
13894 0, /* sq_ass_item */
13895 0, /* sq_ass_slice */
13896 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013897};
13898
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013899static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013900unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013901{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013902 if (PyUnicode_READY(self) == -1)
13903 return NULL;
13904
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013905 if (PyIndex_Check(item)) {
13906 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013907 if (i == -1 && PyErr_Occurred())
13908 return NULL;
13909 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013910 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013911 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013912 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013913 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013914 PyObject *result;
13915 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013916 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013917 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013919 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013920 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013921 return NULL;
13922 }
13923
13924 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013925 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013926 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013927 slicelength == PyUnicode_GET_LENGTH(self)) {
13928 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013929 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013930 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013931 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013932 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013933 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013934 src_kind = PyUnicode_KIND(self);
13935 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013936 if (!PyUnicode_IS_ASCII(self)) {
13937 kind_limit = kind_maxchar_limit(src_kind);
13938 max_char = 0;
13939 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13940 ch = PyUnicode_READ(src_kind, src_data, cur);
13941 if (ch > max_char) {
13942 max_char = ch;
13943 if (max_char >= kind_limit)
13944 break;
13945 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013946 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013947 }
Victor Stinner55c99112011-10-13 01:17:06 +020013948 else
13949 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013950 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013951 if (result == NULL)
13952 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013953 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013954 dest_data = PyUnicode_DATA(result);
13955
13956 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013957 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13958 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013959 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013960 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013961 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013962 } else {
13963 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13964 return NULL;
13965 }
13966}
13967
13968static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013969 (lenfunc)unicode_length, /* mp_length */
13970 (binaryfunc)unicode_subscript, /* mp_subscript */
13971 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013972};
13973
Guido van Rossumd57fd912000-03-10 22:53:23 +000013974
Guido van Rossumd57fd912000-03-10 22:53:23 +000013975/* Helpers for PyUnicode_Format() */
13976
Victor Stinnera47082312012-10-04 02:19:54 +020013977struct unicode_formatter_t {
13978 PyObject *args;
13979 int args_owned;
13980 Py_ssize_t arglen, argidx;
13981 PyObject *dict;
13982
13983 enum PyUnicode_Kind fmtkind;
13984 Py_ssize_t fmtcnt, fmtpos;
13985 void *fmtdata;
13986 PyObject *fmtstr;
13987
13988 _PyUnicodeWriter writer;
13989};
13990
13991struct unicode_format_arg_t {
13992 Py_UCS4 ch;
13993 int flags;
13994 Py_ssize_t width;
13995 int prec;
13996 int sign;
13997};
13998
Guido van Rossumd57fd912000-03-10 22:53:23 +000013999static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014000unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014001{
Victor Stinnera47082312012-10-04 02:19:54 +020014002 Py_ssize_t argidx = ctx->argidx;
14003
14004 if (argidx < ctx->arglen) {
14005 ctx->argidx++;
14006 if (ctx->arglen < 0)
14007 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014008 else
Victor Stinnera47082312012-10-04 02:19:54 +020014009 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014010 }
14011 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014012 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014013 return NULL;
14014}
14015
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014016/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014017
Victor Stinnera47082312012-10-04 02:19:54 +020014018/* Format a float into the writer if the writer is not NULL, or into *p_output
14019 otherwise.
14020
14021 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014022static int
Victor Stinnera47082312012-10-04 02:19:54 +020014023formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14024 PyObject **p_output,
14025 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014026{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014027 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014028 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014029 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014030 int prec;
14031 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014032
Guido van Rossumd57fd912000-03-10 22:53:23 +000014033 x = PyFloat_AsDouble(v);
14034 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014035 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014036
Victor Stinnera47082312012-10-04 02:19:54 +020014037 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014038 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014039 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014040
Victor Stinnera47082312012-10-04 02:19:54 +020014041 if (arg->flags & F_ALT)
14042 dtoa_flags = Py_DTSF_ALT;
14043 else
14044 dtoa_flags = 0;
14045 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014046 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014047 return -1;
14048 len = strlen(p);
14049 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014050 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014051 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014052 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014053 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014054 }
14055 else
14056 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014057 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014058 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014059}
14060
Victor Stinnerd0880d52012-04-27 23:40:13 +020014061/* formatlong() emulates the format codes d, u, o, x and X, and
14062 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14063 * Python's regular ints.
14064 * Return value: a new PyUnicodeObject*, or NULL if error.
14065 * The output string is of the form
14066 * "-"? ("0x" | "0X")? digit+
14067 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14068 * set in flags. The case of hex digits will be correct,
14069 * There will be at least prec digits, zero-filled on the left if
14070 * necessary to get that many.
14071 * val object to be converted
14072 * flags bitmask of format flags; only F_ALT is looked at
14073 * prec minimum number of digits; 0-fill on left if needed
14074 * type a character in [duoxX]; u acts the same as d
14075 *
14076 * CAUTION: o, x and X conversions on regular ints can never
14077 * produce a '-' sign, but can for Python's unbounded ints.
14078 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014079PyObject *
14080_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014081{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014082 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014083 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014084 Py_ssize_t i;
14085 int sign; /* 1 if '-', else 0 */
14086 int len; /* number of characters */
14087 Py_ssize_t llen;
14088 int numdigits; /* len == numnondigits + numdigits */
14089 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014090
Victor Stinnerd0880d52012-04-27 23:40:13 +020014091 /* Avoid exceeding SSIZE_T_MAX */
14092 if (prec > INT_MAX-3) {
14093 PyErr_SetString(PyExc_OverflowError,
14094 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014095 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014096 }
14097
14098 assert(PyLong_Check(val));
14099
14100 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014101 default:
14102 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014103 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014104 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014105 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014106 /* int and int subclasses should print numerically when a numeric */
14107 /* format code is used (see issue18780) */
14108 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014109 break;
14110 case 'o':
14111 numnondigits = 2;
14112 result = PyNumber_ToBase(val, 8);
14113 break;
14114 case 'x':
14115 case 'X':
14116 numnondigits = 2;
14117 result = PyNumber_ToBase(val, 16);
14118 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014119 }
14120 if (!result)
14121 return NULL;
14122
14123 assert(unicode_modifiable(result));
14124 assert(PyUnicode_IS_READY(result));
14125 assert(PyUnicode_IS_ASCII(result));
14126
14127 /* To modify the string in-place, there can only be one reference. */
14128 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014129 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014130 PyErr_BadInternalCall();
14131 return NULL;
14132 }
14133 buf = PyUnicode_DATA(result);
14134 llen = PyUnicode_GET_LENGTH(result);
14135 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014136 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014137 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014138 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014139 return NULL;
14140 }
14141 len = (int)llen;
14142 sign = buf[0] == '-';
14143 numnondigits += sign;
14144 numdigits = len - numnondigits;
14145 assert(numdigits > 0);
14146
14147 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014148 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014149 (type == 'o' || type == 'x' || type == 'X'))) {
14150 assert(buf[sign] == '0');
14151 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14152 buf[sign+1] == 'o');
14153 numnondigits -= 2;
14154 buf += 2;
14155 len -= 2;
14156 if (sign)
14157 buf[0] = '-';
14158 assert(len == numnondigits + numdigits);
14159 assert(numdigits > 0);
14160 }
14161
14162 /* Fill with leading zeroes to meet minimum width. */
14163 if (prec > numdigits) {
14164 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14165 numnondigits + prec);
14166 char *b1;
14167 if (!r1) {
14168 Py_DECREF(result);
14169 return NULL;
14170 }
14171 b1 = PyBytes_AS_STRING(r1);
14172 for (i = 0; i < numnondigits; ++i)
14173 *b1++ = *buf++;
14174 for (i = 0; i < prec - numdigits; i++)
14175 *b1++ = '0';
14176 for (i = 0; i < numdigits; i++)
14177 *b1++ = *buf++;
14178 *b1 = '\0';
14179 Py_DECREF(result);
14180 result = r1;
14181 buf = PyBytes_AS_STRING(result);
14182 len = numnondigits + prec;
14183 }
14184
14185 /* Fix up case for hex conversions. */
14186 if (type == 'X') {
14187 /* Need to convert all lower case letters to upper case.
14188 and need to convert 0x to 0X (and -0x to -0X). */
14189 for (i = 0; i < len; i++)
14190 if (buf[i] >= 'a' && buf[i] <= 'x')
14191 buf[i] -= 'a'-'A';
14192 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014193 if (!PyUnicode_Check(result)
14194 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014195 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014196 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014197 Py_DECREF(result);
14198 result = unicode;
14199 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014200 else if (len != PyUnicode_GET_LENGTH(result)) {
14201 if (PyUnicode_Resize(&result, len) < 0)
14202 Py_CLEAR(result);
14203 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014204 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014205}
14206
Ethan Furmandf3ed242014-01-05 06:50:30 -080014207/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014208 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014209 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014210 * -1 and raise an exception on error */
14211static int
Victor Stinnera47082312012-10-04 02:19:54 +020014212mainformatlong(PyObject *v,
14213 struct unicode_format_arg_t *arg,
14214 PyObject **p_output,
14215 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014216{
14217 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014218 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014219
14220 if (!PyNumber_Check(v))
14221 goto wrongtype;
14222
Ethan Furman9ab74802014-03-21 06:38:46 -070014223 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014224 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014225 if (type == 'o' || type == 'x' || type == 'X') {
14226 iobj = PyNumber_Index(v);
14227 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014228 if (PyErr_ExceptionMatches(PyExc_TypeError))
14229 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014230 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014231 }
14232 }
14233 else {
14234 iobj = PyNumber_Long(v);
14235 if (iobj == NULL ) {
14236 if (PyErr_ExceptionMatches(PyExc_TypeError))
14237 goto wrongtype;
14238 return -1;
14239 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014240 }
14241 assert(PyLong_Check(iobj));
14242 }
14243 else {
14244 iobj = v;
14245 Py_INCREF(iobj);
14246 }
14247
14248 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014249 && arg->width == -1 && arg->prec == -1
14250 && !(arg->flags & (F_SIGN | F_BLANK))
14251 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014252 {
14253 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014254 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014255 int base;
14256
Victor Stinnera47082312012-10-04 02:19:54 +020014257 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014258 {
14259 default:
14260 assert(0 && "'type' not in [diuoxX]");
14261 case 'd':
14262 case 'i':
14263 case 'u':
14264 base = 10;
14265 break;
14266 case 'o':
14267 base = 8;
14268 break;
14269 case 'x':
14270 case 'X':
14271 base = 16;
14272 break;
14273 }
14274
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014275 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14276 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014277 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014278 }
14279 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014280 return 1;
14281 }
14282
Ethan Furmanb95b5612015-01-23 20:05:18 -080014283 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014284 Py_DECREF(iobj);
14285 if (res == NULL)
14286 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014287 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014288 return 0;
14289
14290wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014291 switch(type)
14292 {
14293 case 'o':
14294 case 'x':
14295 case 'X':
14296 PyErr_Format(PyExc_TypeError,
14297 "%%%c format: an integer is required, "
14298 "not %.200s",
14299 type, Py_TYPE(v)->tp_name);
14300 break;
14301 default:
14302 PyErr_Format(PyExc_TypeError,
14303 "%%%c format: a number is required, "
14304 "not %.200s",
14305 type, Py_TYPE(v)->tp_name);
14306 break;
14307 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014308 return -1;
14309}
14310
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014311static Py_UCS4
14312formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014313{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014314 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014315 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014316 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014317 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014318 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014319 goto onError;
14320 }
14321 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014322 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014323 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014324 /* make sure number is a type of integer */
14325 if (!PyLong_Check(v)) {
14326 iobj = PyNumber_Index(v);
14327 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014328 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014329 }
14330 v = iobj;
14331 Py_DECREF(iobj);
14332 }
14333 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014334 x = PyLong_AsLong(v);
14335 if (x == -1 && PyErr_Occurred())
14336 goto onError;
14337
Victor Stinner8faf8212011-12-08 22:14:11 +010014338 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014339 PyErr_SetString(PyExc_OverflowError,
14340 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014341 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014342 }
14343
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014344 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014345 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014346
Benjamin Peterson29060642009-01-31 22:14:21 +000014347 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014348 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014349 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014350 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014351}
14352
Victor Stinnera47082312012-10-04 02:19:54 +020014353/* Parse options of an argument: flags, width, precision.
14354 Handle also "%(name)" syntax.
14355
14356 Return 0 if the argument has been formatted into arg->str.
14357 Return 1 if the argument has been written into ctx->writer,
14358 Raise an exception and return -1 on error. */
14359static int
14360unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14361 struct unicode_format_arg_t *arg)
14362{
14363#define FORMAT_READ(ctx) \
14364 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14365
14366 PyObject *v;
14367
Victor Stinnera47082312012-10-04 02:19:54 +020014368 if (arg->ch == '(') {
14369 /* Get argument value from a dictionary. Example: "%(name)s". */
14370 Py_ssize_t keystart;
14371 Py_ssize_t keylen;
14372 PyObject *key;
14373 int pcount = 1;
14374
14375 if (ctx->dict == NULL) {
14376 PyErr_SetString(PyExc_TypeError,
14377 "format requires a mapping");
14378 return -1;
14379 }
14380 ++ctx->fmtpos;
14381 --ctx->fmtcnt;
14382 keystart = ctx->fmtpos;
14383 /* Skip over balanced parentheses */
14384 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14385 arg->ch = FORMAT_READ(ctx);
14386 if (arg->ch == ')')
14387 --pcount;
14388 else if (arg->ch == '(')
14389 ++pcount;
14390 ctx->fmtpos++;
14391 }
14392 keylen = ctx->fmtpos - keystart - 1;
14393 if (ctx->fmtcnt < 0 || pcount > 0) {
14394 PyErr_SetString(PyExc_ValueError,
14395 "incomplete format key");
14396 return -1;
14397 }
14398 key = PyUnicode_Substring(ctx->fmtstr,
14399 keystart, keystart + keylen);
14400 if (key == NULL)
14401 return -1;
14402 if (ctx->args_owned) {
14403 Py_DECREF(ctx->args);
14404 ctx->args_owned = 0;
14405 }
14406 ctx->args = PyObject_GetItem(ctx->dict, key);
14407 Py_DECREF(key);
14408 if (ctx->args == NULL)
14409 return -1;
14410 ctx->args_owned = 1;
14411 ctx->arglen = -1;
14412 ctx->argidx = -2;
14413 }
14414
14415 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014416 while (--ctx->fmtcnt >= 0) {
14417 arg->ch = FORMAT_READ(ctx);
14418 ctx->fmtpos++;
14419 switch (arg->ch) {
14420 case '-': arg->flags |= F_LJUST; continue;
14421 case '+': arg->flags |= F_SIGN; continue;
14422 case ' ': arg->flags |= F_BLANK; continue;
14423 case '#': arg->flags |= F_ALT; continue;
14424 case '0': arg->flags |= F_ZERO; continue;
14425 }
14426 break;
14427 }
14428
14429 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014430 if (arg->ch == '*') {
14431 v = unicode_format_getnextarg(ctx);
14432 if (v == NULL)
14433 return -1;
14434 if (!PyLong_Check(v)) {
14435 PyErr_SetString(PyExc_TypeError,
14436 "* wants int");
14437 return -1;
14438 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014439 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014440 if (arg->width == -1 && PyErr_Occurred())
14441 return -1;
14442 if (arg->width < 0) {
14443 arg->flags |= F_LJUST;
14444 arg->width = -arg->width;
14445 }
14446 if (--ctx->fmtcnt >= 0) {
14447 arg->ch = FORMAT_READ(ctx);
14448 ctx->fmtpos++;
14449 }
14450 }
14451 else if (arg->ch >= '0' && arg->ch <= '9') {
14452 arg->width = arg->ch - '0';
14453 while (--ctx->fmtcnt >= 0) {
14454 arg->ch = FORMAT_READ(ctx);
14455 ctx->fmtpos++;
14456 if (arg->ch < '0' || arg->ch > '9')
14457 break;
14458 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14459 mixing signed and unsigned comparison. Since arg->ch is between
14460 '0' and '9', casting to int is safe. */
14461 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14462 PyErr_SetString(PyExc_ValueError,
14463 "width too big");
14464 return -1;
14465 }
14466 arg->width = arg->width*10 + (arg->ch - '0');
14467 }
14468 }
14469
14470 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014471 if (arg->ch == '.') {
14472 arg->prec = 0;
14473 if (--ctx->fmtcnt >= 0) {
14474 arg->ch = FORMAT_READ(ctx);
14475 ctx->fmtpos++;
14476 }
14477 if (arg->ch == '*') {
14478 v = unicode_format_getnextarg(ctx);
14479 if (v == NULL)
14480 return -1;
14481 if (!PyLong_Check(v)) {
14482 PyErr_SetString(PyExc_TypeError,
14483 "* wants int");
14484 return -1;
14485 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014486 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014487 if (arg->prec == -1 && PyErr_Occurred())
14488 return -1;
14489 if (arg->prec < 0)
14490 arg->prec = 0;
14491 if (--ctx->fmtcnt >= 0) {
14492 arg->ch = FORMAT_READ(ctx);
14493 ctx->fmtpos++;
14494 }
14495 }
14496 else if (arg->ch >= '0' && arg->ch <= '9') {
14497 arg->prec = arg->ch - '0';
14498 while (--ctx->fmtcnt >= 0) {
14499 arg->ch = FORMAT_READ(ctx);
14500 ctx->fmtpos++;
14501 if (arg->ch < '0' || arg->ch > '9')
14502 break;
14503 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14504 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014505 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014506 return -1;
14507 }
14508 arg->prec = arg->prec*10 + (arg->ch - '0');
14509 }
14510 }
14511 }
14512
14513 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14514 if (ctx->fmtcnt >= 0) {
14515 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14516 if (--ctx->fmtcnt >= 0) {
14517 arg->ch = FORMAT_READ(ctx);
14518 ctx->fmtpos++;
14519 }
14520 }
14521 }
14522 if (ctx->fmtcnt < 0) {
14523 PyErr_SetString(PyExc_ValueError,
14524 "incomplete format");
14525 return -1;
14526 }
14527 return 0;
14528
14529#undef FORMAT_READ
14530}
14531
14532/* Format one argument. Supported conversion specifiers:
14533
14534 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014535 - "i", "d", "u": int or float
14536 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014537 - "e", "E", "f", "F", "g", "G": float
14538 - "c": int or str (1 character)
14539
Victor Stinner8dbd4212012-12-04 09:30:24 +010014540 When possible, the output is written directly into the Unicode writer
14541 (ctx->writer). A string is created when padding is required.
14542
Victor Stinnera47082312012-10-04 02:19:54 +020014543 Return 0 if the argument has been formatted into *p_str,
14544 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014545 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014546static int
14547unicode_format_arg_format(struct unicode_formatter_t *ctx,
14548 struct unicode_format_arg_t *arg,
14549 PyObject **p_str)
14550{
14551 PyObject *v;
14552 _PyUnicodeWriter *writer = &ctx->writer;
14553
14554 if (ctx->fmtcnt == 0)
14555 ctx->writer.overallocate = 0;
14556
14557 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014558 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014559 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014560 return 1;
14561 }
14562
14563 v = unicode_format_getnextarg(ctx);
14564 if (v == NULL)
14565 return -1;
14566
Victor Stinnera47082312012-10-04 02:19:54 +020014567
14568 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014569 case 's':
14570 case 'r':
14571 case 'a':
14572 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14573 /* Fast path */
14574 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14575 return -1;
14576 return 1;
14577 }
14578
14579 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14580 *p_str = v;
14581 Py_INCREF(*p_str);
14582 }
14583 else {
14584 if (arg->ch == 's')
14585 *p_str = PyObject_Str(v);
14586 else if (arg->ch == 'r')
14587 *p_str = PyObject_Repr(v);
14588 else
14589 *p_str = PyObject_ASCII(v);
14590 }
14591 break;
14592
14593 case 'i':
14594 case 'd':
14595 case 'u':
14596 case 'o':
14597 case 'x':
14598 case 'X':
14599 {
14600 int ret = mainformatlong(v, arg, p_str, writer);
14601 if (ret != 0)
14602 return ret;
14603 arg->sign = 1;
14604 break;
14605 }
14606
14607 case 'e':
14608 case 'E':
14609 case 'f':
14610 case 'F':
14611 case 'g':
14612 case 'G':
14613 if (arg->width == -1 && arg->prec == -1
14614 && !(arg->flags & (F_SIGN | F_BLANK)))
14615 {
14616 /* Fast path */
14617 if (formatfloat(v, arg, NULL, writer) == -1)
14618 return -1;
14619 return 1;
14620 }
14621
14622 arg->sign = 1;
14623 if (formatfloat(v, arg, p_str, NULL) == -1)
14624 return -1;
14625 break;
14626
14627 case 'c':
14628 {
14629 Py_UCS4 ch = formatchar(v);
14630 if (ch == (Py_UCS4) -1)
14631 return -1;
14632 if (arg->width == -1 && arg->prec == -1) {
14633 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014634 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014635 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014636 return 1;
14637 }
14638 *p_str = PyUnicode_FromOrdinal(ch);
14639 break;
14640 }
14641
14642 default:
14643 PyErr_Format(PyExc_ValueError,
14644 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014645 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014646 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14647 (int)arg->ch,
14648 ctx->fmtpos - 1);
14649 return -1;
14650 }
14651 if (*p_str == NULL)
14652 return -1;
14653 assert (PyUnicode_Check(*p_str));
14654 return 0;
14655}
14656
14657static int
14658unicode_format_arg_output(struct unicode_formatter_t *ctx,
14659 struct unicode_format_arg_t *arg,
14660 PyObject *str)
14661{
14662 Py_ssize_t len;
14663 enum PyUnicode_Kind kind;
14664 void *pbuf;
14665 Py_ssize_t pindex;
14666 Py_UCS4 signchar;
14667 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014668 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014669 Py_ssize_t sublen;
14670 _PyUnicodeWriter *writer = &ctx->writer;
14671 Py_UCS4 fill;
14672
14673 fill = ' ';
14674 if (arg->sign && arg->flags & F_ZERO)
14675 fill = '0';
14676
14677 if (PyUnicode_READY(str) == -1)
14678 return -1;
14679
14680 len = PyUnicode_GET_LENGTH(str);
14681 if ((arg->width == -1 || arg->width <= len)
14682 && (arg->prec == -1 || arg->prec >= len)
14683 && !(arg->flags & (F_SIGN | F_BLANK)))
14684 {
14685 /* Fast path */
14686 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14687 return -1;
14688 return 0;
14689 }
14690
14691 /* Truncate the string for "s", "r" and "a" formats
14692 if the precision is set */
14693 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14694 if (arg->prec >= 0 && len > arg->prec)
14695 len = arg->prec;
14696 }
14697
14698 /* Adjust sign and width */
14699 kind = PyUnicode_KIND(str);
14700 pbuf = PyUnicode_DATA(str);
14701 pindex = 0;
14702 signchar = '\0';
14703 if (arg->sign) {
14704 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14705 if (ch == '-' || ch == '+') {
14706 signchar = ch;
14707 len--;
14708 pindex++;
14709 }
14710 else if (arg->flags & F_SIGN)
14711 signchar = '+';
14712 else if (arg->flags & F_BLANK)
14713 signchar = ' ';
14714 else
14715 arg->sign = 0;
14716 }
14717 if (arg->width < len)
14718 arg->width = len;
14719
14720 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014721 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014722 if (!(arg->flags & F_LJUST)) {
14723 if (arg->sign) {
14724 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014725 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014726 }
14727 else {
14728 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014729 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014730 }
14731 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014732 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14733 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014734 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014735 }
14736
Victor Stinnera47082312012-10-04 02:19:54 +020014737 buflen = arg->width;
14738 if (arg->sign && len == arg->width)
14739 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014740 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014741 return -1;
14742
14743 /* Write the sign if needed */
14744 if (arg->sign) {
14745 if (fill != ' ') {
14746 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14747 writer->pos += 1;
14748 }
14749 if (arg->width > len)
14750 arg->width--;
14751 }
14752
14753 /* Write the numeric prefix for "x", "X" and "o" formats
14754 if the alternate form is used.
14755 For example, write "0x" for the "%#x" format. */
14756 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14757 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14758 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14759 if (fill != ' ') {
14760 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14761 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14762 writer->pos += 2;
14763 pindex += 2;
14764 }
14765 arg->width -= 2;
14766 if (arg->width < 0)
14767 arg->width = 0;
14768 len -= 2;
14769 }
14770
14771 /* Pad left with the fill character if needed */
14772 if (arg->width > len && !(arg->flags & F_LJUST)) {
14773 sublen = arg->width - len;
14774 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14775 writer->pos += sublen;
14776 arg->width = len;
14777 }
14778
14779 /* If padding with spaces: write sign if needed and/or numeric prefix if
14780 the alternate form is used */
14781 if (fill == ' ') {
14782 if (arg->sign) {
14783 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14784 writer->pos += 1;
14785 }
14786 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14787 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14788 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14789 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14790 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14791 writer->pos += 2;
14792 pindex += 2;
14793 }
14794 }
14795
14796 /* Write characters */
14797 if (len) {
14798 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14799 str, pindex, len);
14800 writer->pos += len;
14801 }
14802
14803 /* Pad right with the fill character if needed */
14804 if (arg->width > len) {
14805 sublen = arg->width - len;
14806 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14807 writer->pos += sublen;
14808 }
14809 return 0;
14810}
14811
14812/* Helper of PyUnicode_Format(): format one arg.
14813 Return 0 on success, raise an exception and return -1 on error. */
14814static int
14815unicode_format_arg(struct unicode_formatter_t *ctx)
14816{
14817 struct unicode_format_arg_t arg;
14818 PyObject *str;
14819 int ret;
14820
Victor Stinner8dbd4212012-12-04 09:30:24 +010014821 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14822 arg.flags = 0;
14823 arg.width = -1;
14824 arg.prec = -1;
14825 arg.sign = 0;
14826 str = NULL;
14827
Victor Stinnera47082312012-10-04 02:19:54 +020014828 ret = unicode_format_arg_parse(ctx, &arg);
14829 if (ret == -1)
14830 return -1;
14831
14832 ret = unicode_format_arg_format(ctx, &arg, &str);
14833 if (ret == -1)
14834 return -1;
14835
14836 if (ret != 1) {
14837 ret = unicode_format_arg_output(ctx, &arg, str);
14838 Py_DECREF(str);
14839 if (ret == -1)
14840 return -1;
14841 }
14842
14843 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14844 PyErr_SetString(PyExc_TypeError,
14845 "not all arguments converted during string formatting");
14846 return -1;
14847 }
14848 return 0;
14849}
14850
Alexander Belopolsky40018472011-02-26 01:02:56 +000014851PyObject *
14852PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014853{
Victor Stinnera47082312012-10-04 02:19:54 +020014854 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014855
Guido van Rossumd57fd912000-03-10 22:53:23 +000014856 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014857 PyErr_BadInternalCall();
14858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014859 }
Victor Stinnera47082312012-10-04 02:19:54 +020014860
14861 ctx.fmtstr = PyUnicode_FromObject(format);
14862 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014863 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014864 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14865 Py_DECREF(ctx.fmtstr);
14866 return NULL;
14867 }
14868 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14869 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14870 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14871 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014872
Victor Stinner8f674cc2013-04-17 23:02:17 +020014873 _PyUnicodeWriter_Init(&ctx.writer);
14874 ctx.writer.min_length = ctx.fmtcnt + 100;
14875 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014876
Guido van Rossumd57fd912000-03-10 22:53:23 +000014877 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014878 ctx.arglen = PyTuple_Size(args);
14879 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014880 }
14881 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014882 ctx.arglen = -1;
14883 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014884 }
Victor Stinnera47082312012-10-04 02:19:54 +020014885 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014886 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014887 ctx.dict = args;
14888 else
14889 ctx.dict = NULL;
14890 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014891
Victor Stinnera47082312012-10-04 02:19:54 +020014892 while (--ctx.fmtcnt >= 0) {
14893 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014894 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014895
14896 nonfmtpos = ctx.fmtpos++;
14897 while (ctx.fmtcnt >= 0 &&
14898 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14899 ctx.fmtpos++;
14900 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014901 }
Victor Stinnera47082312012-10-04 02:19:54 +020014902 if (ctx.fmtcnt < 0) {
14903 ctx.fmtpos--;
14904 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014905 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014906
Victor Stinnercfc4c132013-04-03 01:48:39 +020014907 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14908 nonfmtpos, ctx.fmtpos) < 0)
14909 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014910 }
14911 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014912 ctx.fmtpos++;
14913 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014914 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014915 }
14916 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014917
Victor Stinnera47082312012-10-04 02:19:54 +020014918 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014919 PyErr_SetString(PyExc_TypeError,
14920 "not all arguments converted during string formatting");
14921 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014922 }
14923
Victor Stinnera47082312012-10-04 02:19:54 +020014924 if (ctx.args_owned) {
14925 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014926 }
Victor Stinnera47082312012-10-04 02:19:54 +020014927 Py_DECREF(ctx.fmtstr);
14928 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014929
Benjamin Peterson29060642009-01-31 22:14:21 +000014930 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014931 Py_DECREF(ctx.fmtstr);
14932 _PyUnicodeWriter_Dealloc(&ctx.writer);
14933 if (ctx.args_owned) {
14934 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014935 }
14936 return NULL;
14937}
14938
Jeremy Hylton938ace62002-07-17 16:30:39 +000014939static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014940unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14941
Tim Peters6d6c1a32001-08-02 04:15:00 +000014942static PyObject *
14943unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14944{
Benjamin Peterson29060642009-01-31 22:14:21 +000014945 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014946 static char *kwlist[] = {"object", "encoding", "errors", 0};
14947 char *encoding = NULL;
14948 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014949
Benjamin Peterson14339b62009-01-31 16:36:08 +000014950 if (type != &PyUnicode_Type)
14951 return unicode_subtype_new(type, args, kwds);
14952 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014953 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014954 return NULL;
14955 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014956 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014957 if (encoding == NULL && errors == NULL)
14958 return PyObject_Str(x);
14959 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014960 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014961}
14962
Guido van Rossume023fe02001-08-30 03:12:59 +000014963static PyObject *
14964unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14965{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014966 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014967 Py_ssize_t length, char_size;
14968 int share_wstr, share_utf8;
14969 unsigned int kind;
14970 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014971
Benjamin Peterson14339b62009-01-31 16:36:08 +000014972 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014973
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014974 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014975 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014976 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014977 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014978 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014979 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014980 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014981 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014982
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014983 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014984 if (self == NULL) {
14985 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014986 return NULL;
14987 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014988 kind = PyUnicode_KIND(unicode);
14989 length = PyUnicode_GET_LENGTH(unicode);
14990
14991 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014992#ifdef Py_DEBUG
14993 _PyUnicode_HASH(self) = -1;
14994#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014995 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014996#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014997 _PyUnicode_STATE(self).interned = 0;
14998 _PyUnicode_STATE(self).kind = kind;
14999 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015000 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015001 _PyUnicode_STATE(self).ready = 1;
15002 _PyUnicode_WSTR(self) = NULL;
15003 _PyUnicode_UTF8_LENGTH(self) = 0;
15004 _PyUnicode_UTF8(self) = NULL;
15005 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015006 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015007
15008 share_utf8 = 0;
15009 share_wstr = 0;
15010 if (kind == PyUnicode_1BYTE_KIND) {
15011 char_size = 1;
15012 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15013 share_utf8 = 1;
15014 }
15015 else if (kind == PyUnicode_2BYTE_KIND) {
15016 char_size = 2;
15017 if (sizeof(wchar_t) == 2)
15018 share_wstr = 1;
15019 }
15020 else {
15021 assert(kind == PyUnicode_4BYTE_KIND);
15022 char_size = 4;
15023 if (sizeof(wchar_t) == 4)
15024 share_wstr = 1;
15025 }
15026
15027 /* Ensure we won't overflow the length. */
15028 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15029 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015030 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015031 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015032 data = PyObject_MALLOC((length + 1) * char_size);
15033 if (data == NULL) {
15034 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015035 goto onError;
15036 }
15037
Victor Stinnerc3c74152011-10-02 20:39:55 +020015038 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015039 if (share_utf8) {
15040 _PyUnicode_UTF8_LENGTH(self) = length;
15041 _PyUnicode_UTF8(self) = data;
15042 }
15043 if (share_wstr) {
15044 _PyUnicode_WSTR_LENGTH(self) = length;
15045 _PyUnicode_WSTR(self) = (wchar_t *)data;
15046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015047
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015048 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015049 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015050 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015051#ifdef Py_DEBUG
15052 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15053#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015054 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015055 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015056
15057onError:
15058 Py_DECREF(unicode);
15059 Py_DECREF(self);
15060 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015061}
15062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015063PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015064"str(object='') -> str\n\
15065str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015066\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015067Create a new string object from the given object. If encoding or\n\
15068errors is specified, then the object must expose a data buffer\n\
15069that will be decoded using the given encoding and error handler.\n\
15070Otherwise, returns the result of object.__str__() (if defined)\n\
15071or repr(object).\n\
15072encoding defaults to sys.getdefaultencoding().\n\
15073errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015074
static PyObject *unicode_iter(PyObject *seq);

/* Type object of "str".  The initializer is positional: every entry must
   stay in the slot order of PyTypeObject.  The type can be subclassed
   (Py_TPFLAGS_BASETYPE) and carries Py_TPFLAGS_UNICODE_SUBCLASS. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,   /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15120
15121/* Initialize the Unicode implementation */
15122
Victor Stinner3a50e702011-10-18 21:21:00 +020015123int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015124{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015125 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015126 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015127 0x000A, /* LINE FEED */
15128 0x000D, /* CARRIAGE RETURN */
15129 0x001C, /* FILE SEPARATOR */
15130 0x001D, /* GROUP SEPARATOR */
15131 0x001E, /* RECORD SEPARATOR */
15132 0x0085, /* NEXT LINE */
15133 0x2028, /* LINE SEPARATOR */
15134 0x2029, /* PARAGRAPH SEPARATOR */
15135 };
15136
Fred Drakee4315f52000-05-09 19:53:39 +000015137 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015138 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015139 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015140 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015141 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015142
Guido van Rossumcacfc072002-05-24 19:01:59 +000015143 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015144 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015145
15146 /* initialize the linebreak bloom filter */
15147 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015148 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015149 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015150
Christian Heimes26532f72013-07-20 14:57:16 +020015151 if (PyType_Ready(&EncodingMapType) < 0)
15152 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015153
Benjamin Petersonc4311282012-10-30 23:21:10 -040015154 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15155 Py_FatalError("Can't initialize field name iterator type");
15156
15157 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15158 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015159
Victor Stinner3a50e702011-10-18 21:21:00 +020015160 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015161}
15162
15163/* Finalize the Unicode implementation */
15164
/* Clear the unicode free list.

   This implementation releases nothing, so the reported count is
   always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15170
Guido van Rossumd57fd912000-03-10 22:53:23 +000015171void
Thomas Wouters78890102000-07-22 19:25:51 +000015172_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015173{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015174 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015175
Serhiy Storchaka05997252013-01-26 12:14:02 +020015176 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015177
Serhiy Storchaka05997252013-01-26 12:14:02 +020015178 for (i = 0; i < 256; i++)
15179 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015180 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015181 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015182}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015183
Walter Dörwald16807132007-05-25 13:52:07 +000015184void
15185PyUnicode_InternInPlace(PyObject **p)
15186{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015187 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015188 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015189#ifdef Py_DEBUG
15190 assert(s != NULL);
15191 assert(_PyUnicode_CHECK(s));
15192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015193 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015194 return;
15195#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015196 /* If it's a subclass, we don't really know what putting
15197 it in the interned dict might do. */
15198 if (!PyUnicode_CheckExact(s))
15199 return;
15200 if (PyUnicode_CHECK_INTERNED(s))
15201 return;
15202 if (interned == NULL) {
15203 interned = PyDict_New();
15204 if (interned == NULL) {
15205 PyErr_Clear(); /* Don't leave an exception */
15206 return;
15207 }
15208 }
15209 /* It might be that the GetItem call fails even
15210 though the key is present in the dictionary,
15211 namely when this happens during a stack overflow. */
15212 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015213 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015214 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015215
Victor Stinnerf0335102013-04-14 19:13:03 +020015216 if (t) {
15217 Py_INCREF(t);
15218 Py_DECREF(*p);
15219 *p = t;
15220 return;
15221 }
Walter Dörwald16807132007-05-25 13:52:07 +000015222
Benjamin Peterson14339b62009-01-31 16:36:08 +000015223 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015224 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015225 PyErr_Clear();
15226 PyThreadState_GET()->recursion_critical = 0;
15227 return;
15228 }
15229 PyThreadState_GET()->recursion_critical = 0;
15230 /* The two references in interned are not counted by refcnt.
15231 The deallocator will take care of this */
15232 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015233 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015234}
15235
15236void
15237PyUnicode_InternImmortal(PyObject **p)
15238{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015239 PyUnicode_InternInPlace(p);
15240 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015241 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015242 Py_INCREF(*p);
15243 }
Walter Dörwald16807132007-05-25 13:52:07 +000015244}
15245
15246PyObject *
15247PyUnicode_InternFromString(const char *cp)
15248{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015249 PyObject *s = PyUnicode_FromString(cp);
15250 if (s == NULL)
15251 return NULL;
15252 PyUnicode_InternInPlace(&s);
15253 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015254}
15255
Alexander Belopolsky40018472011-02-26 01:02:56 +000015256void
15257_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015258{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015259 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015260 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015261 Py_ssize_t i, n;
15262 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015263
Benjamin Peterson14339b62009-01-31 16:36:08 +000015264 if (interned == NULL || !PyDict_Check(interned))
15265 return;
15266 keys = PyDict_Keys(interned);
15267 if (keys == NULL || !PyList_Check(keys)) {
15268 PyErr_Clear();
15269 return;
15270 }
Walter Dörwald16807132007-05-25 13:52:07 +000015271
Benjamin Peterson14339b62009-01-31 16:36:08 +000015272 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15273 detector, interned unicode strings are not forcibly deallocated;
15274 rather, we give them their stolen references back, and then clear
15275 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015276
Benjamin Peterson14339b62009-01-31 16:36:08 +000015277 n = PyList_GET_SIZE(keys);
15278 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015279 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015280 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015281 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015282 if (PyUnicode_READY(s) == -1) {
15283 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015284 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015286 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015287 case SSTATE_NOT_INTERNED:
15288 /* XXX Shouldn't happen */
15289 break;
15290 case SSTATE_INTERNED_IMMORTAL:
15291 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015292 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015293 break;
15294 case SSTATE_INTERNED_MORTAL:
15295 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015296 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015297 break;
15298 default:
15299 Py_FatalError("Inconsistent interned string state.");
15300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015301 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015302 }
15303 fprintf(stderr, "total size of all interned strings: "
15304 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15305 "mortal/immortal\n", mortal_size, immortal_size);
15306 Py_DECREF(keys);
15307 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015308 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015309}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015310
15311
15312/********************* Unicode Iterator **************************/
15313
15314typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015315 PyObject_HEAD
15316 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015317 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015318} unicodeiterobject;
15319
15320static void
15321unicodeiter_dealloc(unicodeiterobject *it)
15322{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015323 _PyObject_GC_UNTRACK(it);
15324 Py_XDECREF(it->it_seq);
15325 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015326}
15327
15328static int
15329unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15330{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015331 Py_VISIT(it->it_seq);
15332 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015333}
15334
15335static PyObject *
15336unicodeiter_next(unicodeiterobject *it)
15337{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015338 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015339
Benjamin Peterson14339b62009-01-31 16:36:08 +000015340 assert(it != NULL);
15341 seq = it->it_seq;
15342 if (seq == NULL)
15343 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015344 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015346 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15347 int kind = PyUnicode_KIND(seq);
15348 void *data = PyUnicode_DATA(seq);
15349 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15350 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015351 if (item != NULL)
15352 ++it->it_index;
15353 return item;
15354 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015355
Benjamin Peterson14339b62009-01-31 16:36:08 +000015356 Py_DECREF(seq);
15357 it->it_seq = NULL;
15358 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015359}
15360
15361static PyObject *
15362unicodeiter_len(unicodeiterobject *it)
15363{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015364 Py_ssize_t len = 0;
15365 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015366 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015367 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015368}
15369
15370PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15371
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015372static PyObject *
15373unicodeiter_reduce(unicodeiterobject *it)
15374{
15375 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015376 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015377 it->it_seq, it->it_index);
15378 } else {
15379 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15380 if (u == NULL)
15381 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015382 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015383 }
15384}
15385
15386PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15387
15388static PyObject *
15389unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15390{
15391 Py_ssize_t index = PyLong_AsSsize_t(state);
15392 if (index == -1 && PyErr_Occurred())
15393 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015394 if (it->it_seq != NULL) {
15395 if (index < 0)
15396 index = 0;
15397 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15398 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15399 it->it_index = index;
15400 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015401 Py_RETURN_NONE;
15402}
15403
15404PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15405
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015406static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015407 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015408 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015409 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15410 reduce_doc},
15411 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15412 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015413 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015414};
15415
15416PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015417 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15418 "str_iterator", /* tp_name */
15419 sizeof(unicodeiterobject), /* tp_basicsize */
15420 0, /* tp_itemsize */
15421 /* methods */
15422 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15423 0, /* tp_print */
15424 0, /* tp_getattr */
15425 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015426 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015427 0, /* tp_repr */
15428 0, /* tp_as_number */
15429 0, /* tp_as_sequence */
15430 0, /* tp_as_mapping */
15431 0, /* tp_hash */
15432 0, /* tp_call */
15433 0, /* tp_str */
15434 PyObject_GenericGetAttr, /* tp_getattro */
15435 0, /* tp_setattro */
15436 0, /* tp_as_buffer */
15437 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15438 0, /* tp_doc */
15439 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15440 0, /* tp_clear */
15441 0, /* tp_richcompare */
15442 0, /* tp_weaklistoffset */
15443 PyObject_SelfIter, /* tp_iter */
15444 (iternextfunc)unicodeiter_next, /* tp_iternext */
15445 unicodeiter_methods, /* tp_methods */
15446 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015447};
15448
15449static PyObject *
15450unicode_iter(PyObject *seq)
15451{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015452 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015453
Benjamin Peterson14339b62009-01-31 16:36:08 +000015454 if (!PyUnicode_Check(seq)) {
15455 PyErr_BadInternalCall();
15456 return NULL;
15457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015458 if (PyUnicode_READY(seq) == -1)
15459 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015460 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15461 if (it == NULL)
15462 return NULL;
15463 it->it_index = 0;
15464 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015465 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015466 _PyObject_GC_TRACK(it);
15467 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015468}
15469
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015470
15471size_t
15472Py_UNICODE_strlen(const Py_UNICODE *u)
15473{
15474 int res = 0;
15475 while(*u++)
15476 res++;
15477 return res;
15478}
15479
15480Py_UNICODE*
15481Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15482{
15483 Py_UNICODE *u = s1;
15484 while ((*u++ = *s2++));
15485 return s1;
15486}
15487
15488Py_UNICODE*
15489Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15490{
15491 Py_UNICODE *u = s1;
15492 while ((*u++ = *s2++))
15493 if (n-- == 0)
15494 break;
15495 return s1;
15496}
15497
15498Py_UNICODE*
15499Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15500{
15501 Py_UNICODE *u1 = s1;
15502 u1 += Py_UNICODE_strlen(u1);
15503 Py_UNICODE_strcpy(u1, s2);
15504 return s1;
15505}
15506
15507int
15508Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15509{
15510 while (*s1 && *s2 && *s1 == *s2)
15511 s1++, s2++;
15512 if (*s1 && *s2)
15513 return (*s1 < *s2) ? -1 : +1;
15514 if (*s1)
15515 return 1;
15516 if (*s2)
15517 return -1;
15518 return 0;
15519}
15520
15521int
15522Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15523{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015524 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015525 for (; n != 0; n--) {
15526 u1 = *s1;
15527 u2 = *s2;
15528 if (u1 != u2)
15529 return (u1 < u2) ? -1 : +1;
15530 if (u1 == '\0')
15531 return 0;
15532 s1++;
15533 s2++;
15534 }
15535 return 0;
15536}
15537
15538Py_UNICODE*
15539Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15540{
15541 const Py_UNICODE *p;
15542 for (p = s; *p; p++)
15543 if (*p == c)
15544 return (Py_UNICODE*)p;
15545 return NULL;
15546}
15547
15548Py_UNICODE*
15549Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15550{
15551 const Py_UNICODE *p;
15552 p = s + Py_UNICODE_strlen(s);
15553 while (p != s) {
15554 p--;
15555 if (*p == c)
15556 return (Py_UNICODE*)p;
15557 }
15558 return NULL;
15559}
Victor Stinner331ea922010-08-10 16:37:20 +000015560
Victor Stinner71133ff2010-09-01 23:43:53 +000015561Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015562PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015563{
Victor Stinner577db2c2011-10-11 22:12:48 +020015564 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015565 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015567 if (!PyUnicode_Check(unicode)) {
15568 PyErr_BadArgument();
15569 return NULL;
15570 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015571 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015572 if (u == NULL)
15573 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015574 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015575 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015576 PyErr_NoMemory();
15577 return NULL;
15578 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015579 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015580 size *= sizeof(Py_UNICODE);
15581 copy = PyMem_Malloc(size);
15582 if (copy == NULL) {
15583 PyErr_NoMemory();
15584 return NULL;
15585 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015586 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015587 return copy;
15588}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015589
Georg Brandl66c221e2010-10-14 07:04:07 +000015590/* A _string module, to export formatter_parser and formatter_field_name_split
15591 to the string.Formatter class implemented in Python. */
15592
15593static PyMethodDef _string_methods[] = {
15594 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15595 METH_O, PyDoc_STR("split the argument as a field name")},
15596 {"formatter_parser", (PyCFunction) formatter_parser,
15597 METH_O, PyDoc_STR("parse the argument as a format string")},
15598 {NULL, NULL}
15599};
15600
15601static struct PyModuleDef _string_module = {
15602 PyModuleDef_HEAD_INIT,
15603 "_string",
15604 PyDoc_STR("string helper module"),
15605 0,
15606 _string_methods,
15607 NULL,
15608 NULL,
15609 NULL,
15610 NULL
15611};
15612
15613PyMODINIT_FUNC
15614PyInit__string(void)
15615{
15616 return PyModule_Create(&_string_module);
15617}
15618
15619
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015620#ifdef __cplusplus
15621}
15622#endif