blob: 18a30e207ad6b99dbac1510181848475455829ac [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.
29
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process, Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
Victor Stinner8faf8212011-12-08 22:14:11 +010069/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
70#define MAX_UNICODE 0x10ffff
71
/* Object check used by the assertions in this file: the full consistency
   checker (without content scan) in debug builds, a plain type check
   otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 field of the compact object layout. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer.  For a compact ASCII object this is the memory
   directly after the PyASCIIObject header; for every other object it is
   the stored utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length.  A compact ASCII object reports its character
   length; every other object reports the stored utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors; each one is usable as an lvalue. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Kind and length readers that assert the object check first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data pointer of the non-compact layout. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement for PyUnicode_READY(): asserts the object check,
   then evaluates to 0 when the object is already ready and to the result
   of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer of `op` is the same address as its character
   data, i.e. no separate UTF-8 buffer is attached.  Asserts that `op` is
   not a compact ASCII object. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same address as its character
   data.  Every part of the expansion refers to the macro argument, so the
   macro does not depend on any variable name at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object owns a UTF-8 buffer of its own: it is not a
   compact ASCII object, its utf8 pointer is set, and that pointer is not
   simply the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a wstr buffer of its own: the wstr
   pointer is set and either the object is not ready yet or the pointer
   is not simply the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op)                                \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
141
/* Generic helper macro to convert characters between element types.
   from_type and to_type must be valid type names; begin and end delimit
   the source characters and should be of type "from_type *".  to points
   to the output buffer ("to_type *"), which must have room for
   (end - begin) elements.  Each element is converted with a plain cast.
   The bulk of the input is handled four elements per iteration and the
   remaining 0-3 elements one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        to_type *_to = (to_type *)(to);                         \
        const from_type *_iter = (from_type *)(begin);          \
        const from_type *_end = (from_type *)(end);             \
        Py_ssize_t _count = _end - _iter;                       \
        const from_type *_unrolled_end =                        \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);             \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {   \
            _to[0] = (to_type) _iter[0];                        \
            _to[1] = (to_type) _iter[1];                        \
            _to[2] = (to_type) _iter[2];                        \
            _to[3] = (to_type) _iter[3];                        \
        }                                                       \
        for (; _iter < _end; ++_iter, ++_to)                    \
            *_to = (to_type) *_iter;                            \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Over-allocation divisor.  NOTE(review): the users of this macro are not
   visible here; judging by the percentages quoted below, the extra space
   is presumably size / OVERALLOCATE_FACTOR -- confirm at the call sites. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% works best */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% works best */
#  define OVERALLOCATE_FACTOR 4
#endif
173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  On creation an extra Py_INCREF is
   performed, so the global and the caller each end up with a reference.
   If creation fails, unicode_empty is left NULL and no reference is
   taken. */
#define _Py_INCREF_UNICODE_EMPTY()                              \
    do {                                                        \
        if (unicode_empty == NULL) {                            \
            unicode_empty = PyUnicode_New(0, 0);                \
            if (unicode_empty != NULL) {                        \
                Py_INCREF(unicode_empty);                       \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                                   \
        }                                                       \
        else {                                                  \
            Py_INCREF(unicode_empty);                           \
        }                                                       \
    } while (0)

/* Return the shared empty string from the enclosing function (NULL if it
   could not be created). */
#define _Py_RETURN_UNICODE_EMPTY()                              \
    do {                                                        \
        _Py_INCREF_UNICODE_EMPTY();                             \
        return unicode_empty;                                   \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
207Py_LOCAL_INLINE(int)
208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters: one flag per
   7-bit code point (128 entries), 1 meaning "whitespace". */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x09 CHARACTER TABULATION
     *   0x0A LINE FEED
     *   0x0B LINE TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Fast detection of line break characters in the 7-bit range: one flag per
   code point (128 entries), 1 meaning "line break".  The table is only ever
   read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0A LINE FEED
     *   0x0B LINE TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Codec error handlers identified by name (see get_error_handler()).
   _Py_ERROR_UNKNOWN is deliberately 0; _Py_ERROR_OTHER is what
   get_error_handler() returns for any name it does not recognise. */
typedef enum {
    _Py_ERROR_UNKNOWN           = 0,
    _Py_ERROR_STRICT            = 1,
    _Py_ERROR_SURROGATEESCAPE   = 2,
    _Py_ERROR_REPLACE           = 3,
    _Py_ERROR_IGNORE            = 4,
    _Py_ERROR_BACKSLASHREPLACE  = 5,
    _Py_ERROR_SURROGATEPASS     = 6,
    _Py_ERROR_XMLCHARREFREPLACE = 7,
    _Py_ERROR_OTHER             = 8
} _Py_error_handler;
315
316static _Py_error_handler
317get_error_handler(const char *errors)
318{
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200319 if (errors == NULL || strcmp(errors, "strict") == 0)
Victor Stinner50149202015-09-22 00:26:54 +0200320 return _Py_ERROR_STRICT;
321 if (strcmp(errors, "surrogateescape") == 0)
322 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner50149202015-09-22 00:26:54 +0200323 if (strcmp(errors, "replace") == 0)
324 return _Py_ERROR_REPLACE;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200325 if (strcmp(errors, "ignore") == 0)
326 return _Py_ERROR_IGNORE;
327 if (strcmp(errors, "backslashreplace") == 0)
328 return _Py_ERROR_BACKSLASHREPLACE;
329 if (strcmp(errors, "surrogatepass") == 0)
330 return _Py_ERROR_SURROGATEPASS;
Victor Stinner50149202015-09-22 00:26:54 +0200331 if (strcmp(errors, "xmlcharrefreplace") == 0)
332 return _Py_ERROR_XMLCHARREFREPLACE;
333 return _Py_ERROR_OTHER;
334}
335
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300336/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
337 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000338Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000339PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000340{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000341#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000342 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000343#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000344 /* This is actually an illegal character, so it should
345 not be passed to unichr. */
346 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347#endif
348}
349
#ifdef Py_DEBUG
/* Debug-build invariant checker for a Unicode object.

   op:            the object to check; must pass PyUnicode_Check().
   check_content: if non-zero (and the object has a kind other than
                  PyUnicode_WCHAR_KIND), additionally scan the characters
                  to verify that the narrowest possible kind is in use and
                  that the data is followed by a null character.

   Every violated invariant trips an assert().  The function always
   returns 1, which lets callers wrap the call itself in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose element size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters directly follow the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* non-compact: the characters are referenced by data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as its own UTF-8 form */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is expected to alias the data exactly when the kind
               matches the width of wchar_t */
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing buffer must have a zero recorded length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 current = PyUnicode_READ(kind, data, pos);
            if (maxchar < current) {
                maxchar = current;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character data must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
465
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466static PyObject*
467unicode_result_wchar(PyObject *unicode)
468{
469#ifndef Py_DEBUG
470 Py_ssize_t len;
471
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100472 len = _PyUnicode_WSTR_LENGTH(unicode);
473 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100474 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200475 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 }
477
478 if (len == 1) {
479 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100480 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
482 Py_DECREF(unicode);
483 return latin1_char;
484 }
485 }
486
487 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200488 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489 return NULL;
490 }
491#else
Victor Stinneraa771272012-10-04 02:32:58 +0200492 assert(Py_REFCNT(unicode) == 1);
493
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100494 /* don't make the result ready in debug mode to ensure that the caller
495 makes the string ready before using it */
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497#endif
498 return unicode;
499}
500
501static PyObject*
502unicode_result_ready(PyObject *unicode)
503{
504 Py_ssize_t length;
505
506 length = PyUnicode_GET_LENGTH(unicode);
507 if (length == 0) {
508 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100509 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200510 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100511 }
512 return unicode_empty;
513 }
514
515 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200516 void *data = PyUnicode_DATA(unicode);
517 int kind = PyUnicode_KIND(unicode);
518 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100519 if (ch < 256) {
520 PyObject *latin1_char = unicode_latin1[ch];
521 if (latin1_char != NULL) {
522 if (unicode != latin1_char) {
523 Py_INCREF(latin1_char);
524 Py_DECREF(unicode);
525 }
526 return latin1_char;
527 }
528 else {
529 assert(_PyUnicode_CheckConsistency(unicode, 1));
530 Py_INCREF(unicode);
531 unicode_latin1[ch] = unicode;
532 return unicode;
533 }
534 }
535 }
536
537 assert(_PyUnicode_CheckConsistency(unicode, 1));
538 return unicode;
539}
540
541static PyObject*
542unicode_result(PyObject *unicode)
543{
544 assert(_PyUnicode_CHECK(unicode));
545 if (PyUnicode_IS_READY(unicode))
546 return unicode_result_ready(unicode);
547 else
548 return unicode_result_wchar(unicode);
549}
550
Victor Stinnerc4b49542011-12-11 22:44:26 +0100551static PyObject*
552unicode_result_unchanged(PyObject *unicode)
553{
554 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500555 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100556 return NULL;
557 Py_INCREF(unicode);
558 return unicode;
559 }
560 else
561 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100562 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563}
564
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200565/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
566 ASCII, Latin1, UTF-8, etc. */
567static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200568backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200569 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
570{
Victor Stinnerad771582015-10-09 12:38:53 +0200571 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572 Py_UCS4 ch;
573 enum PyUnicode_Kind kind;
574 void *data;
575
576 assert(PyUnicode_IS_READY(unicode));
577 kind = PyUnicode_KIND(unicode);
578 data = PyUnicode_DATA(unicode);
579
580 size = 0;
581 /* determine replacement size */
582 for (i = collstart; i < collend; ++i) {
583 Py_ssize_t incr;
584
585 ch = PyUnicode_READ(kind, data, i);
586 if (ch < 0x100)
587 incr = 2+2;
588 else if (ch < 0x10000)
589 incr = 2+4;
590 else {
591 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200592 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200593 }
594 if (size > PY_SSIZE_T_MAX - incr) {
595 PyErr_SetString(PyExc_OverflowError,
596 "encoded result is too long for a Python string");
597 return NULL;
598 }
599 size += incr;
600 }
601
Victor Stinnerad771582015-10-09 12:38:53 +0200602 str = _PyBytesWriter_Prepare(writer, str, size);
603 if (str == NULL)
604 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200605
606 /* generate replacement */
607 for (i = collstart; i < collend; ++i) {
608 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200609 *str++ = '\\';
610 if (ch >= 0x00010000) {
611 *str++ = 'U';
612 *str++ = Py_hexdigits[(ch>>28)&0xf];
613 *str++ = Py_hexdigits[(ch>>24)&0xf];
614 *str++ = Py_hexdigits[(ch>>20)&0xf];
615 *str++ = Py_hexdigits[(ch>>16)&0xf];
616 *str++ = Py_hexdigits[(ch>>12)&0xf];
617 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200618 }
Victor Stinner797485e2015-10-09 03:17:30 +0200619 else if (ch >= 0x100) {
620 *str++ = 'u';
621 *str++ = Py_hexdigits[(ch>>12)&0xf];
622 *str++ = Py_hexdigits[(ch>>8)&0xf];
623 }
624 else
625 *str++ = 'x';
626 *str++ = Py_hexdigits[(ch>>4)&0xf];
627 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200628 }
629 return str;
630}
631
632/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
633 ASCII, Latin1, UTF-8, etc. */
634static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200635xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200636 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
637{
Victor Stinnerad771582015-10-09 12:38:53 +0200638 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200639 Py_UCS4 ch;
640 enum PyUnicode_Kind kind;
641 void *data;
642
643 assert(PyUnicode_IS_READY(unicode));
644 kind = PyUnicode_KIND(unicode);
645 data = PyUnicode_DATA(unicode);
646
647 size = 0;
648 /* determine replacement size */
649 for (i = collstart; i < collend; ++i) {
650 Py_ssize_t incr;
651
652 ch = PyUnicode_READ(kind, data, i);
653 if (ch < 10)
654 incr = 2+1+1;
655 else if (ch < 100)
656 incr = 2+2+1;
657 else if (ch < 1000)
658 incr = 2+3+1;
659 else if (ch < 10000)
660 incr = 2+4+1;
661 else if (ch < 100000)
662 incr = 2+5+1;
663 else if (ch < 1000000)
664 incr = 2+6+1;
665 else {
666 assert(ch <= MAX_UNICODE);
667 incr = 2+7+1;
668 }
669 if (size > PY_SSIZE_T_MAX - incr) {
670 PyErr_SetString(PyExc_OverflowError,
671 "encoded result is too long for a Python string");
672 return NULL;
673 }
674 size += incr;
675 }
676
Victor Stinnerad771582015-10-09 12:38:53 +0200677 str = _PyBytesWriter_Prepare(writer, str, size);
678 if (str == NULL)
679 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200680
681 /* generate replacement */
682 for (i = collstart; i < collend; ++i) {
683 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
684 }
685 return str;
686}
687
Thomas Wouters477c8d52006-05-27 19:21:47 +0000688/* --- Bloom Filters ----------------------------------------------------- */
689
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
693
694/* the linebreak mask is set up by Unicode_Init below */
695
Antoine Pitrouf068f942010-01-13 14:19:12 +0000696#if LONG_BIT >= 128
697#define BLOOM_WIDTH 128
698#elif LONG_BIT >= 64
699#define BLOOM_WIDTH 64
700#elif LONG_BIT >= 32
701#define BLOOM_WIDTH 32
702#else
703#error "LONG_BIT is smaller than 32"
704#endif
705
Thomas Wouters477c8d52006-05-27 19:21:47 +0000706#define BLOOM_MASK unsigned long
707
Serhiy Storchaka05997252013-01-26 12:14:02 +0200708static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000709
Antoine Pitrouf068f942010-01-13 14:19:12 +0000710#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000711
Benjamin Peterson29060642009-01-31 22:14:21 +0000712#define BLOOM_LINEBREAK(ch) \
713 ((ch) < 128U ? ascii_linebreak[(ch)] : \
714 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000715
Alexander Belopolsky40018472011-02-26 01:02:56 +0000716Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718{
Victor Stinnera85af502013-04-09 21:53:54 +0200719#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
720 do { \
721 TYPE *data = (TYPE *)PTR; \
722 TYPE *end = data + LEN; \
723 Py_UCS4 ch; \
724 for (; data != end; data++) { \
725 ch = *data; \
726 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
727 } \
728 break; \
729 } while (0)
730
Thomas Wouters477c8d52006-05-27 19:21:47 +0000731 /* calculate simple bloom-style bitmask for a given unicode string */
732
Antoine Pitrouf068f942010-01-13 14:19:12 +0000733 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734
735 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200736 switch (kind) {
737 case PyUnicode_1BYTE_KIND:
738 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
739 break;
740 case PyUnicode_2BYTE_KIND:
741 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
742 break;
743 case PyUnicode_4BYTE_KIND:
744 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
745 break;
746 default:
747 assert(0);
748 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000749 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200750
751#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000752}
753
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200754/* Compilation of templated routines */
755
756#include "stringlib/asciilib.h"
757#include "stringlib/fastsearch.h"
758#include "stringlib/partition.h"
759#include "stringlib/split.h"
760#include "stringlib/count.h"
761#include "stringlib/find.h"
762#include "stringlib/find_max_char.h"
763#include "stringlib/localeutil.h"
764#include "stringlib/undef.h"
765
766#include "stringlib/ucs1lib.h"
767#include "stringlib/fastsearch.h"
768#include "stringlib/partition.h"
769#include "stringlib/split.h"
770#include "stringlib/count.h"
771#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300772#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773#include "stringlib/find_max_char.h"
774#include "stringlib/localeutil.h"
775#include "stringlib/undef.h"
776
777#include "stringlib/ucs2lib.h"
778#include "stringlib/fastsearch.h"
779#include "stringlib/partition.h"
780#include "stringlib/split.h"
781#include "stringlib/count.h"
782#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300783#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200784#include "stringlib/find_max_char.h"
785#include "stringlib/localeutil.h"
786#include "stringlib/undef.h"
787
788#include "stringlib/ucs4lib.h"
789#include "stringlib/fastsearch.h"
790#include "stringlib/partition.h"
791#include "stringlib/split.h"
792#include "stringlib/count.h"
793#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300794#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200795#include "stringlib/find_max_char.h"
796#include "stringlib/localeutil.h"
797#include "stringlib/undef.h"
798
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200799#include "stringlib/unicodedefs.h"
800#include "stringlib/fastsearch.h"
801#include "stringlib/count.h"
802#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100803#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200804
Guido van Rossumd57fd912000-03-10 22:53:23 +0000805/* --- Unicode Object ----------------------------------------------------- */
806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200808fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200810Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200811 Py_ssize_t size, Py_UCS4 ch,
812 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200814 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
815
816 switch (kind) {
817 case PyUnicode_1BYTE_KIND:
818 {
819 Py_UCS1 ch1 = (Py_UCS1) ch;
820 if (ch1 == ch)
821 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
822 else
823 return -1;
824 }
825 case PyUnicode_2BYTE_KIND:
826 {
827 Py_UCS2 ch2 = (Py_UCS2) ch;
828 if (ch2 == ch)
829 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
830 else
831 return -1;
832 }
833 case PyUnicode_4BYTE_KIND:
834 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
835 default:
836 assert(0);
837 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200838 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200839}
840
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters [old_length, length) of a string
   with 0xFF bytes so that code reading them before they are initialized is
   detected earlier.  Nothing is written when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the size in bytes of one character */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        Py_UCS1 *first = bytes + old_length * char_size;
        memset(first, 0xff, (new_length - old_length) * char_size);
    }
}
#endif
859
Victor Stinnerfe226c02011-10-03 03:52:20 +0200860static PyObject*
861resize_compact(PyObject *unicode, Py_ssize_t length)
862{
863 Py_ssize_t char_size;
864 Py_ssize_t struct_size;
865 Py_ssize_t new_size;
866 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100867 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200868#ifdef Py_DEBUG
869 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
870#endif
871
Victor Stinner79891572012-05-03 13:43:07 +0200872 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200873 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100874 assert(PyUnicode_IS_COMPACT(unicode));
875
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200876 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100877 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200878 struct_size = sizeof(PyASCIIObject);
879 else
880 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200881 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200882
Victor Stinnerfe226c02011-10-03 03:52:20 +0200883 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
884 PyErr_NoMemory();
885 return NULL;
886 }
887 new_size = (struct_size + (length + 1) * char_size);
888
Victor Stinner84def372011-12-11 20:04:56 +0100889 _Py_DEC_REFTOTAL;
890 _Py_ForgetReference(unicode);
891
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300892 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100893 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100894 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200895 PyErr_NoMemory();
896 return NULL;
897 }
Victor Stinner84def372011-12-11 20:04:56 +0100898 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200899 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100900
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200902 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200903 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100904 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100907 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
908 PyObject_DEL(_PyUnicode_WSTR(unicode));
909 _PyUnicode_WSTR(unicode) = NULL;
910 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200911#ifdef Py_DEBUG
912 unicode_fill_invalid(unicode, old_length);
913#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200914 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
915 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200916 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917 return unicode;
918}
919
Alexander Belopolsky40018472011-02-26 01:02:56 +0000920static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200921resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000922{
Victor Stinner95663112011-10-04 01:03:50 +0200923 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100924 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200925 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200926 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000927
Victor Stinnerfe226c02011-10-03 03:52:20 +0200928 if (PyUnicode_IS_READY(unicode)) {
929 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200930 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200931 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200932#ifdef Py_DEBUG
933 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
934#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935
936 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200937 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200938 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
939 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200940
941 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
942 PyErr_NoMemory();
943 return -1;
944 }
945 new_size = (length + 1) * char_size;
946
Victor Stinner7a9105a2011-12-12 00:13:42 +0100947 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
948 {
949 PyObject_DEL(_PyUnicode_UTF8(unicode));
950 _PyUnicode_UTF8(unicode) = NULL;
951 _PyUnicode_UTF8_LENGTH(unicode) = 0;
952 }
953
Victor Stinnerfe226c02011-10-03 03:52:20 +0200954 data = (PyObject *)PyObject_REALLOC(data, new_size);
955 if (data == NULL) {
956 PyErr_NoMemory();
957 return -1;
958 }
959 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200960 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200962 _PyUnicode_WSTR_LENGTH(unicode) = length;
963 }
964 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200965 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200966 _PyUnicode_UTF8_LENGTH(unicode) = length;
967 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 _PyUnicode_LENGTH(unicode) = length;
969 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200970#ifdef Py_DEBUG
971 unicode_fill_invalid(unicode, old_length);
972#endif
Victor Stinner95663112011-10-04 01:03:50 +0200973 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200974 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200975 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200976 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977 }
Victor Stinner95663112011-10-04 01:03:50 +0200978 assert(_PyUnicode_WSTR(unicode) != NULL);
979
980 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700981 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200982 PyErr_NoMemory();
983 return -1;
984 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100985 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200986 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100987 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200988 if (!wstr) {
989 PyErr_NoMemory();
990 return -1;
991 }
992 _PyUnicode_WSTR(unicode) = wstr;
993 _PyUnicode_WSTR(unicode)[length] = 0;
994 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200995 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000996 return 0;
997}
998
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999static PyObject*
1000resize_copy(PyObject *unicode, Py_ssize_t length)
1001{
1002 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001003 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001004 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001005
Benjamin Petersonbac79492012-01-14 13:34:47 -05001006 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001007 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001008
1009 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1010 if (copy == NULL)
1011 return NULL;
1012
1013 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001014 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001015 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001016 }
1017 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001018 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001019
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001020 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001021 if (w == NULL)
1022 return NULL;
1023 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1024 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001025 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1026 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001027 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028 }
1029}
1030
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001032 Ux0000 terminated; some code (e.g. new_identifier)
1033 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034
1035 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001036 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037
1038*/
1039
Alexander Belopolsky40018472011-02-26 01:02:56 +00001040static PyUnicodeObject *
1041_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001043 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045
Thomas Wouters477c8d52006-05-27 19:21:47 +00001046 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 if (length == 0 && unicode_empty != NULL) {
1048 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001049 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001050 }
1051
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001052 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001053 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001054 return (PyUnicodeObject *)PyErr_NoMemory();
1055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 if (length < 0) {
1057 PyErr_SetString(PyExc_SystemError,
1058 "Negative size passed to _PyUnicode_New");
1059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 }
1061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1063 if (unicode == NULL)
1064 return NULL;
1065 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001066
1067 _PyUnicode_WSTR_LENGTH(unicode) = length;
1068 _PyUnicode_HASH(unicode) = -1;
1069 _PyUnicode_STATE(unicode).interned = 0;
1070 _PyUnicode_STATE(unicode).kind = 0;
1071 _PyUnicode_STATE(unicode).compact = 0;
1072 _PyUnicode_STATE(unicode).ready = 0;
1073 _PyUnicode_STATE(unicode).ascii = 0;
1074 _PyUnicode_DATA_ANY(unicode) = NULL;
1075 _PyUnicode_LENGTH(unicode) = 0;
1076 _PyUnicode_UTF8(unicode) = NULL;
1077 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001079 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1080 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001081 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001082 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001083 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085
Jeremy Hyltond8082792003-09-16 19:41:39 +00001086 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001087 * the caller fails before initializing str -- unicode_resize()
1088 * reads str[0], and the Keep-Alive optimization can keep memory
1089 * allocated for str alive across a call to unicode_dealloc(unicode).
1090 * We don't want unicode_resize to read uninitialized memory in
1091 * that case.
1092 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093 _PyUnicode_WSTR(unicode)[0] = 0;
1094 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001095
Victor Stinner7931d9a2011-11-04 00:22:48 +01001096 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097 return unicode;
1098}
1099
Victor Stinnerf42dc442011-10-02 23:33:16 +02001100static const char*
1101unicode_kind_name(PyObject *unicode)
1102{
Victor Stinner42dfd712011-10-03 14:41:45 +02001103 /* don't check consistency: unicode_kind_name() is called from
1104 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001105 if (!PyUnicode_IS_COMPACT(unicode))
1106 {
1107 if (!PyUnicode_IS_READY(unicode))
1108 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001109 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001110 {
1111 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001112 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001113 return "legacy ascii";
1114 else
1115 return "legacy latin1";
1116 case PyUnicode_2BYTE_KIND:
1117 return "legacy UCS2";
1118 case PyUnicode_4BYTE_KIND:
1119 return "legacy UCS4";
1120 default:
1121 return "<legacy invalid kind>";
1122 }
1123 }
1124 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001125 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001126 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001127 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001128 return "ascii";
1129 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001130 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001132 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001133 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001134 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001135 default:
1136 return "<invalid compact kind>";
1137 }
1138}
1139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001140#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1145
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1149void *_PyUnicode_data(void *unicode){
1150 printf("obj %p\n", unicode);
1151 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1152 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1153 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1154 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1155 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1156 return PyUnicode_DATA(unicode);
1157}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001158
1159void
1160_PyUnicode_Dump(PyObject *op)
1161{
1162 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001163 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1164 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1165 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001166
Victor Stinnera849a4b2011-10-03 12:12:11 +02001167 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001168 {
1169 if (ascii->state.ascii)
1170 data = (ascii + 1);
1171 else
1172 data = (compact + 1);
1173 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001174 else
1175 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001176 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1177 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001178
Victor Stinnera849a4b2011-10-03 12:12:11 +02001179 if (ascii->wstr == data)
1180 printf("shared ");
1181 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001182
Victor Stinnera3b334d2011-10-03 13:53:37 +02001183 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001184 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001185 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1186 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001187 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1188 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001189 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001190 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001191}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001192#endif
1193
1194PyObject *
1195PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1196{
1197 PyObject *obj;
1198 PyCompactUnicodeObject *unicode;
1199 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001200 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001201 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202 Py_ssize_t char_size;
1203 Py_ssize_t struct_size;
1204
1205 /* Optimization for empty strings */
1206 if (size == 0 && unicode_empty != NULL) {
1207 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001208 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209 }
1210
Victor Stinner9e9d6892011-10-04 01:02:02 +02001211 is_ascii = 0;
1212 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001213 struct_size = sizeof(PyCompactUnicodeObject);
1214 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001215 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 char_size = 1;
1217 is_ascii = 1;
1218 struct_size = sizeof(PyASCIIObject);
1219 }
1220 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001221 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222 char_size = 1;
1223 }
1224 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001225 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 char_size = 2;
1227 if (sizeof(wchar_t) == 2)
1228 is_sharing = 1;
1229 }
1230 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001231 if (maxchar > MAX_UNICODE) {
1232 PyErr_SetString(PyExc_SystemError,
1233 "invalid maximum character passed to PyUnicode_New");
1234 return NULL;
1235 }
Victor Stinner8f825062012-04-27 13:55:39 +02001236 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001237 char_size = 4;
1238 if (sizeof(wchar_t) == 4)
1239 is_sharing = 1;
1240 }
1241
1242 /* Ensure we won't overflow the size. */
1243 if (size < 0) {
1244 PyErr_SetString(PyExc_SystemError,
1245 "Negative size passed to PyUnicode_New");
1246 return NULL;
1247 }
1248 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1249 return PyErr_NoMemory();
1250
1251 /* Duplicated allocation code from _PyObject_New() instead of a call to
1252 * PyObject_New() so we are able to allocate space for the object and
1253 * it's data buffer.
1254 */
1255 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1256 if (obj == NULL)
1257 return PyErr_NoMemory();
1258 obj = PyObject_INIT(obj, &PyUnicode_Type);
1259 if (obj == NULL)
1260 return NULL;
1261
1262 unicode = (PyCompactUnicodeObject *)obj;
1263 if (is_ascii)
1264 data = ((PyASCIIObject*)obj) + 1;
1265 else
1266 data = unicode + 1;
1267 _PyUnicode_LENGTH(unicode) = size;
1268 _PyUnicode_HASH(unicode) = -1;
1269 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001270 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001271 _PyUnicode_STATE(unicode).compact = 1;
1272 _PyUnicode_STATE(unicode).ready = 1;
1273 _PyUnicode_STATE(unicode).ascii = is_ascii;
1274 if (is_ascii) {
1275 ((char*)data)[size] = 0;
1276 _PyUnicode_WSTR(unicode) = NULL;
1277 }
Victor Stinner8f825062012-04-27 13:55:39 +02001278 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 ((char*)data)[size] = 0;
1280 _PyUnicode_WSTR(unicode) = NULL;
1281 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001283 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001285 else {
1286 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001287 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001288 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001290 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001291 ((Py_UCS4*)data)[size] = 0;
1292 if (is_sharing) {
1293 _PyUnicode_WSTR_LENGTH(unicode) = size;
1294 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1295 }
1296 else {
1297 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1298 _PyUnicode_WSTR(unicode) = NULL;
1299 }
1300 }
Victor Stinner8f825062012-04-27 13:55:39 +02001301#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001302 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001303#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001304 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 return obj;
1306}
1307
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t characters of [begin, end) to UCS4 and store
   them in the data of unicode, which must be a 4-byte kind string.  A high
   surrogate immediately followed by a low surrogate is joined into a
   single code point; every other character is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* never write past the characters of the destination */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* surrogate pair: two input characters, one code point */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            dst++;
            src += 2;
        }
        else {
            *dst = *src;
            dst++;
            src++;
        }
    }
    /* the destination must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1347
Victor Stinnercd9950f2011-10-02 00:34:53 +02001348static int
Victor Stinner488fa492011-12-12 00:01:39 +01001349unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001350{
Victor Stinner488fa492011-12-12 00:01:39 +01001351 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001352 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001353 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001354 return -1;
1355 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001356 return 0;
1357}
1358
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001359static int
1360_copy_characters(PyObject *to, Py_ssize_t to_start,
1361 PyObject *from, Py_ssize_t from_start,
1362 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001364 unsigned int from_kind, to_kind;
1365 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366
Victor Stinneree4544c2012-05-09 22:24:08 +02001367 assert(0 <= how_many);
1368 assert(0 <= from_start);
1369 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001371 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001372 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373
Victor Stinnerd3f08822012-05-29 12:57:52 +02001374 assert(PyUnicode_Check(to));
1375 assert(PyUnicode_IS_READY(to));
1376 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1377
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001378 if (how_many == 0)
1379 return 0;
1380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001382 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001384 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385
Victor Stinnerf1852262012-06-16 16:38:26 +02001386#ifdef Py_DEBUG
1387 if (!check_maxchar
1388 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1389 {
1390 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1391 Py_UCS4 ch;
1392 Py_ssize_t i;
1393 for (i=0; i < how_many; i++) {
1394 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1395 assert(ch <= to_maxchar);
1396 }
1397 }
1398#endif
1399
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001400 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001401 if (check_maxchar
1402 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1403 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001404 /* Writing Latin-1 characters into an ASCII string requires to
1405 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001406 Py_UCS4 max_char;
1407 max_char = ucs1lib_find_max_char(from_data,
1408 (Py_UCS1*)from_data + how_many);
1409 if (max_char >= 128)
1410 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001411 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001412 Py_MEMCPY((char*)to_data + to_kind * to_start,
1413 (char*)from_data + from_kind * from_start,
1414 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001416 else if (from_kind == PyUnicode_1BYTE_KIND
1417 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001418 {
1419 _PyUnicode_CONVERT_BYTES(
1420 Py_UCS1, Py_UCS2,
1421 PyUnicode_1BYTE_DATA(from) + from_start,
1422 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1423 PyUnicode_2BYTE_DATA(to) + to_start
1424 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001425 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001426 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001427 && to_kind == PyUnicode_4BYTE_KIND)
1428 {
1429 _PyUnicode_CONVERT_BYTES(
1430 Py_UCS1, Py_UCS4,
1431 PyUnicode_1BYTE_DATA(from) + from_start,
1432 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1433 PyUnicode_4BYTE_DATA(to) + to_start
1434 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001435 }
1436 else if (from_kind == PyUnicode_2BYTE_KIND
1437 && to_kind == PyUnicode_4BYTE_KIND)
1438 {
1439 _PyUnicode_CONVERT_BYTES(
1440 Py_UCS2, Py_UCS4,
1441 PyUnicode_2BYTE_DATA(from) + from_start,
1442 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1443 PyUnicode_4BYTE_DATA(to) + to_start
1444 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001445 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001446 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001447 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1448
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001449 if (!check_maxchar) {
1450 if (from_kind == PyUnicode_2BYTE_KIND
1451 && to_kind == PyUnicode_1BYTE_KIND)
1452 {
1453 _PyUnicode_CONVERT_BYTES(
1454 Py_UCS2, Py_UCS1,
1455 PyUnicode_2BYTE_DATA(from) + from_start,
1456 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1457 PyUnicode_1BYTE_DATA(to) + to_start
1458 );
1459 }
1460 else if (from_kind == PyUnicode_4BYTE_KIND
1461 && to_kind == PyUnicode_1BYTE_KIND)
1462 {
1463 _PyUnicode_CONVERT_BYTES(
1464 Py_UCS4, Py_UCS1,
1465 PyUnicode_4BYTE_DATA(from) + from_start,
1466 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1467 PyUnicode_1BYTE_DATA(to) + to_start
1468 );
1469 }
1470 else if (from_kind == PyUnicode_4BYTE_KIND
1471 && to_kind == PyUnicode_2BYTE_KIND)
1472 {
1473 _PyUnicode_CONVERT_BYTES(
1474 Py_UCS4, Py_UCS2,
1475 PyUnicode_4BYTE_DATA(from) + from_start,
1476 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1477 PyUnicode_2BYTE_DATA(to) + to_start
1478 );
1479 }
1480 else {
1481 assert(0);
1482 return -1;
1483 }
1484 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001485 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001486 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001487 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001488 Py_ssize_t i;
1489
Victor Stinnera0702ab2011-09-29 14:14:38 +02001490 for (i=0; i < how_many; i++) {
1491 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001492 if (ch > to_maxchar)
1493 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001494 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1495 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001496 }
1497 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001498 return 0;
1499}
1500
Victor Stinnerd3f08822012-05-29 12:57:52 +02001501void
1502_PyUnicode_FastCopyCharacters(
1503 PyObject *to, Py_ssize_t to_start,
1504 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001505{
1506 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1507}
1508
1509Py_ssize_t
1510PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1511 PyObject *from, Py_ssize_t from_start,
1512 Py_ssize_t how_many)
1513{
1514 int err;
1515
1516 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1517 PyErr_BadInternalCall();
1518 return -1;
1519 }
1520
Benjamin Petersonbac79492012-01-14 13:34:47 -05001521 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001522 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001523 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 return -1;
1525
Victor Stinnerd3f08822012-05-29 12:57:52 +02001526 if (from_start < 0) {
1527 PyErr_SetString(PyExc_IndexError, "string index out of range");
1528 return -1;
1529 }
1530 if (to_start < 0) {
1531 PyErr_SetString(PyExc_IndexError, "string index out of range");
1532 return -1;
1533 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001534 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1535 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1536 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001537 "Cannot write %zi characters at %zi "
1538 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001539 how_many, to_start, PyUnicode_GET_LENGTH(to));
1540 return -1;
1541 }
1542
1543 if (how_many == 0)
1544 return 0;
1545
Victor Stinner488fa492011-12-12 00:01:39 +01001546 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547 return -1;
1548
1549 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1550 if (err) {
1551 PyErr_Format(PyExc_SystemError,
1552 "Cannot copy %s characters "
1553 "into a string of %s characters",
1554 unicode_kind_name(from),
1555 unicode_kind_name(to));
1556 return -1;
1557 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001558 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559}
1560
Victor Stinner17222162011-09-28 22:15:37 +02001561/* Find the maximum code point and count the number of surrogate pairs so a
1562 correct string length can be computed before converting a string to UCS4.
1563 This function counts single surrogates as a character and not as a pair.
1564
1565 Return 0 on success, or -1 on error. */
1566static int
1567find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1568 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569{
1570 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001571 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572
Victor Stinnerc53be962011-10-02 21:33:54 +02001573 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 *num_surrogates = 0;
1575 *maxchar = 0;
1576
1577 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001578#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001579 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1580 && (iter+1) < end
1581 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1582 {
1583 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1584 ++(*num_surrogates);
1585 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001586 }
1587 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001589 {
1590 ch = *iter;
1591 iter++;
1592 }
1593 if (ch > *maxchar) {
1594 *maxchar = ch;
1595 if (*maxchar > MAX_UNICODE) {
1596 PyErr_Format(PyExc_ValueError,
1597 "character U+%x is not in range [U+0000; U+10ffff]",
1598 ch);
1599 return -1;
1600 }
1601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602 }
1603 return 0;
1604}
1605
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001606int
1607_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608{
1609 wchar_t *end;
1610 Py_UCS4 maxchar = 0;
1611 Py_ssize_t num_surrogates;
1612#if SIZEOF_WCHAR_T == 2
1613 Py_ssize_t length_wo_surrogates;
1614#endif
1615
Georg Brandl7597add2011-10-05 16:36:47 +02001616 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001617 strings were created using _PyObject_New() and where no canonical
1618 representation (the str field) has been set yet aka strings
1619 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001620 assert(_PyUnicode_CHECK(unicode));
1621 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001622 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001623 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001624 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001625 /* Actually, it should neither be interned nor be anything else: */
1626 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001629 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001630 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632
1633 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001634 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1635 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 PyErr_NoMemory();
1637 return -1;
1638 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001639 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 _PyUnicode_WSTR(unicode), end,
1641 PyUnicode_1BYTE_DATA(unicode));
1642 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1643 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1644 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1645 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001646 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001647 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001648 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001649 }
1650 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001651 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001652 _PyUnicode_UTF8(unicode) = NULL;
1653 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654 }
1655 PyObject_FREE(_PyUnicode_WSTR(unicode));
1656 _PyUnicode_WSTR(unicode) = NULL;
1657 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1658 }
1659 /* In this case we might have to convert down from 4-byte native
1660 wchar_t to 2-byte unicode. */
1661 else if (maxchar < 65536) {
1662 assert(num_surrogates == 0 &&
1663 "FindMaxCharAndNumSurrogatePairs() messed up");
1664
Victor Stinner506f5922011-09-28 22:34:18 +02001665#if SIZEOF_WCHAR_T == 2
1666 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001667 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001668 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1669 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1670 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001671 _PyUnicode_UTF8(unicode) = NULL;
1672 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001673#else
1674 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001675 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001676 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001677 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001678 PyErr_NoMemory();
1679 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 }
Victor Stinner506f5922011-09-28 22:34:18 +02001681 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1682 _PyUnicode_WSTR(unicode), end,
1683 PyUnicode_2BYTE_DATA(unicode));
1684 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1685 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1686 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001687 _PyUnicode_UTF8(unicode) = NULL;
1688 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001689 PyObject_FREE(_PyUnicode_WSTR(unicode));
1690 _PyUnicode_WSTR(unicode) = NULL;
1691 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1692#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 }
1694 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1695 else {
1696#if SIZEOF_WCHAR_T == 2
1697 /* in case the native representation is 2-bytes, we need to allocate a
1698 new normalized 4-byte version. */
1699 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001700 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1701 PyErr_NoMemory();
1702 return -1;
1703 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001704 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1705 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 PyErr_NoMemory();
1707 return -1;
1708 }
1709 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1710 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001711 _PyUnicode_UTF8(unicode) = NULL;
1712 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001713 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1714 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001715 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 PyObject_FREE(_PyUnicode_WSTR(unicode));
1717 _PyUnicode_WSTR(unicode) = NULL;
1718 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1719#else
1720 assert(num_surrogates == 0);
1721
Victor Stinnerc3c74152011-10-02 20:39:55 +02001722 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001724 _PyUnicode_UTF8(unicode) = NULL;
1725 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1727#endif
1728 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1729 }
1730 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001731 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001732 return 0;
1733}
1734
Alexander Belopolsky40018472011-02-26 01:02:56 +00001735static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001736unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737{
Walter Dörwald16807132007-05-25 13:52:07 +00001738 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001739 case SSTATE_NOT_INTERNED:
1740 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001741
Benjamin Peterson29060642009-01-31 22:14:21 +00001742 case SSTATE_INTERNED_MORTAL:
1743 /* revive dead object temporarily for DelItem */
1744 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001745 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001746 Py_FatalError(
1747 "deletion of interned string failed");
1748 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001749
Benjamin Peterson29060642009-01-31 22:14:21 +00001750 case SSTATE_INTERNED_IMMORTAL:
1751 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001752
Benjamin Peterson29060642009-01-31 22:14:21 +00001753 default:
1754 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001755 }
1756
Victor Stinner03490912011-10-03 23:45:12 +02001757 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001759 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001760 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001761 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1762 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001764 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765}
1766
#ifdef Py_DEBUG
/* Debug helper: return 1 if the object is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only a string of length 1 with a canonical representation can be a
       cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256)
        return 0;
    return unicode_latin1[ch] == unicode;
}
#endif
1783
Alexander Belopolsky40018472011-02-26 01:02:56 +00001784static int
Victor Stinner488fa492011-12-12 00:01:39 +01001785unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001786{
Victor Stinner488fa492011-12-12 00:01:39 +01001787 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001788 if (Py_REFCNT(unicode) != 1)
1789 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001790 if (_PyUnicode_HASH(unicode) != -1)
1791 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001792 if (PyUnicode_CHECK_INTERNED(unicode))
1793 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001794 if (!PyUnicode_CheckExact(unicode))
1795 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001796#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001797 /* singleton refcount is greater than 1 */
1798 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001799#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001800 return 1;
1801}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001802
Victor Stinnerfe226c02011-10-03 03:52:20 +02001803static int
1804unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1805{
1806 PyObject *unicode;
1807 Py_ssize_t old_length;
1808
1809 assert(p_unicode != NULL);
1810 unicode = *p_unicode;
1811
1812 assert(unicode != NULL);
1813 assert(PyUnicode_Check(unicode));
1814 assert(0 <= length);
1815
Victor Stinner910337b2011-10-03 03:20:16 +02001816 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001817 old_length = PyUnicode_WSTR_LENGTH(unicode);
1818 else
1819 old_length = PyUnicode_GET_LENGTH(unicode);
1820 if (old_length == length)
1821 return 0;
1822
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001823 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001824 _Py_INCREF_UNICODE_EMPTY();
1825 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001826 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001827 Py_DECREF(*p_unicode);
1828 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001829 return 0;
1830 }
1831
Victor Stinner488fa492011-12-12 00:01:39 +01001832 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833 PyObject *copy = resize_copy(unicode, length);
1834 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001835 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001836 Py_DECREF(*p_unicode);
1837 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001838 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001839 }
1840
Victor Stinnerfe226c02011-10-03 03:52:20 +02001841 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001842 PyObject *new_unicode = resize_compact(unicode, length);
1843 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001844 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001845 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001846 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001847 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001848 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001849}
1850
Alexander Belopolsky40018472011-02-26 01:02:56 +00001851int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001852PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001853{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001854 PyObject *unicode;
1855 if (p_unicode == NULL) {
1856 PyErr_BadInternalCall();
1857 return -1;
1858 }
1859 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001860 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001861 {
1862 PyErr_BadInternalCall();
1863 return -1;
1864 }
1865 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001866}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001867
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001868/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001869
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001870 WARNING: The function doesn't copy the terminating null character and
1871 doesn't check the maximum character (may write a latin1 character in an
1872 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001873static void
1874unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1875 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001876{
1877 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1878 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001879 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001880
1881 switch (kind) {
1882 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001883 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001884#ifdef Py_DEBUG
1885 if (PyUnicode_IS_ASCII(unicode)) {
1886 Py_UCS4 maxchar = ucs1lib_find_max_char(
1887 (const Py_UCS1*)str,
1888 (const Py_UCS1*)str + len);
1889 assert(maxchar < 128);
1890 }
1891#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001892 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001893 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001894 }
1895 case PyUnicode_2BYTE_KIND: {
1896 Py_UCS2 *start = (Py_UCS2 *)data + index;
1897 Py_UCS2 *ucs2 = start;
1898 assert(index <= PyUnicode_GET_LENGTH(unicode));
1899
Victor Stinner184252a2012-06-16 02:57:41 +02001900 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001901 *ucs2 = (Py_UCS2)*str;
1902
1903 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001904 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001905 }
1906 default: {
1907 Py_UCS4 *start = (Py_UCS4 *)data + index;
1908 Py_UCS4 *ucs4 = start;
1909 assert(kind == PyUnicode_4BYTE_KIND);
1910 assert(index <= PyUnicode_GET_LENGTH(unicode));
1911
Victor Stinner184252a2012-06-16 02:57:41 +02001912 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001913 *ucs4 = (Py_UCS4)*str;
1914
1915 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001916 }
1917 }
1918}
1919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920static PyObject*
1921get_latin1_char(unsigned char ch)
1922{
Victor Stinnera464fc12011-10-02 20:39:30 +02001923 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001924 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001925 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 if (!unicode)
1927 return NULL;
1928 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001929 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 unicode_latin1[ch] = unicode;
1931 }
1932 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001933 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934}
1935
Victor Stinner985a82a2014-01-03 12:53:47 +01001936static PyObject*
1937unicode_char(Py_UCS4 ch)
1938{
1939 PyObject *unicode;
1940
1941 assert(ch <= MAX_UNICODE);
1942
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001943 if (ch < 256)
1944 return get_latin1_char(ch);
1945
Victor Stinner985a82a2014-01-03 12:53:47 +01001946 unicode = PyUnicode_New(1, ch);
1947 if (unicode == NULL)
1948 return NULL;
1949 switch (PyUnicode_KIND(unicode)) {
1950 case PyUnicode_1BYTE_KIND:
1951 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1952 break;
1953 case PyUnicode_2BYTE_KIND:
1954 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1955 break;
1956 default:
1957 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1958 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1959 }
1960 assert(_PyUnicode_CheckConsistency(unicode, 1));
1961 return unicode;
1962}
1963
Alexander Belopolsky40018472011-02-26 01:02:56 +00001964PyObject *
1965PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001967 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 Py_UCS4 maxchar = 0;
1969 Py_ssize_t num_surrogates;
1970
1971 if (u == NULL)
1972 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001974 /* If the Unicode data is known at construction time, we can apply
1975 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001978 if (size == 0)
1979 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 /* Single character Unicode objects in the Latin-1 range are
1982 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001983 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 return get_latin1_char((unsigned char)*u);
1985
1986 /* If not empty and not single character, copy the Unicode data
1987 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001988 if (find_maxchar_surrogates(u, u + size,
1989 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 return NULL;
1991
Victor Stinner8faf8212011-12-08 22:14:11 +01001992 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993 if (!unicode)
1994 return NULL;
1995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 switch (PyUnicode_KIND(unicode)) {
1997 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001998 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2000 break;
2001 case PyUnicode_2BYTE_KIND:
2002#if Py_UNICODE_SIZE == 2
2003 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2004#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002005 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2007#endif
2008 break;
2009 case PyUnicode_4BYTE_KIND:
2010#if SIZEOF_WCHAR_T == 2
2011 /* This is the only case which has to process surrogates, thus
2012 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002013 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014#else
2015 assert(num_surrogates == 0);
2016 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2017#endif
2018 break;
2019 default:
2020 assert(0 && "Impossible state");
2021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002022
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002023 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024}
2025
Alexander Belopolsky40018472011-02-26 01:02:56 +00002026PyObject *
2027PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002028{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002029 if (size < 0) {
2030 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002031 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002032 return NULL;
2033 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002034 if (u != NULL)
2035 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2036 else
2037 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002038}
2039
Alexander Belopolsky40018472011-02-26 01:02:56 +00002040PyObject *
2041PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002042{
2043 size_t size = strlen(u);
2044 if (size > PY_SSIZE_T_MAX) {
2045 PyErr_SetString(PyExc_OverflowError, "input too long");
2046 return NULL;
2047 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002048 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002049}
2050
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002051PyObject *
2052_PyUnicode_FromId(_Py_Identifier *id)
2053{
2054 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002055 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2056 strlen(id->string),
2057 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002058 if (!id->object)
2059 return NULL;
2060 PyUnicode_InternInPlace(&id->object);
2061 assert(!id->next);
2062 id->next = static_strings;
2063 static_strings = id;
2064 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002065 return id->object;
2066}
2067
2068void
2069_PyUnicode_ClearStaticStrings()
2070{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002071 _Py_Identifier *tmp, *s = static_strings;
2072 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002073 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002074 tmp = s->next;
2075 s->next = NULL;
2076 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002077 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002078 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002079}
2080
Benjamin Peterson0df54292012-03-26 14:50:32 -04002081/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002082
Victor Stinnerd3f08822012-05-29 12:57:52 +02002083PyObject*
2084_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002085{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002086 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002087 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002088 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002089#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002090 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002091#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002092 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002093 }
Victor Stinner785938e2011-12-11 20:09:03 +01002094 unicode = PyUnicode_New(size, 127);
2095 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002096 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002097 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2098 assert(_PyUnicode_CheckConsistency(unicode, 1));
2099 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002100}
2101
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002102static Py_UCS4
2103kind_maxchar_limit(unsigned int kind)
2104{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002105 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002106 case PyUnicode_1BYTE_KIND:
2107 return 0x80;
2108 case PyUnicode_2BYTE_KIND:
2109 return 0x100;
2110 case PyUnicode_4BYTE_KIND:
2111 return 0x10000;
2112 default:
2113 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002114 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002115 }
2116}
2117
Victor Stinnere6abb482012-05-02 01:15:40 +02002118Py_LOCAL_INLINE(Py_UCS4)
2119align_maxchar(Py_UCS4 maxchar)
2120{
2121 if (maxchar <= 127)
2122 return 127;
2123 else if (maxchar <= 255)
2124 return 255;
2125 else if (maxchar <= 65535)
2126 return 65535;
2127 else
2128 return MAX_UNICODE;
2129}
2130
Victor Stinner702c7342011-10-05 13:50:52 +02002131static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002132_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002135 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002136
Serhiy Storchaka678db842013-01-26 12:16:36 +02002137 if (size == 0)
2138 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002139 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002140 if (size == 1)
2141 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002142
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002143 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002144 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 if (!res)
2146 return NULL;
2147 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002148 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002150}
2151
Victor Stinnere57b1c02011-09-28 22:20:48 +02002152static PyObject*
2153_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154{
2155 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002156 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002157
Serhiy Storchaka678db842013-01-26 12:16:36 +02002158 if (size == 0)
2159 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002160 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002161 if (size == 1)
2162 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002163
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002164 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002165 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 if (!res)
2167 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002168 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002170 else {
2171 _PyUnicode_CONVERT_BYTES(
2172 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2173 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002174 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002175 return res;
2176}
2177
Victor Stinnere57b1c02011-09-28 22:20:48 +02002178static PyObject*
2179_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180{
2181 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002182 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002183
Serhiy Storchaka678db842013-01-26 12:16:36 +02002184 if (size == 0)
2185 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002186 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002187 if (size == 1)
2188 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002189
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002190 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002191 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 if (!res)
2193 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002194 if (max_char < 256)
2195 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2196 PyUnicode_1BYTE_DATA(res));
2197 else if (max_char < 0x10000)
2198 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2199 PyUnicode_2BYTE_DATA(res));
2200 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002202 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 return res;
2204}
2205
2206PyObject*
2207PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2208{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002209 if (size < 0) {
2210 PyErr_SetString(PyExc_ValueError, "size must be positive");
2211 return NULL;
2212 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002213 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002215 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002217 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002219 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002220 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002221 PyErr_SetString(PyExc_SystemError, "invalid kind");
2222 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224}
2225
Victor Stinnerece58de2012-04-23 23:36:38 +02002226Py_UCS4
2227_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2228{
2229 enum PyUnicode_Kind kind;
2230 void *startptr, *endptr;
2231
2232 assert(PyUnicode_IS_READY(unicode));
2233 assert(0 <= start);
2234 assert(end <= PyUnicode_GET_LENGTH(unicode));
2235 assert(start <= end);
2236
2237 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2238 return PyUnicode_MAX_CHAR_VALUE(unicode);
2239
2240 if (start == end)
2241 return 127;
2242
Victor Stinner94d558b2012-04-27 22:26:58 +02002243 if (PyUnicode_IS_ASCII(unicode))
2244 return 127;
2245
Victor Stinnerece58de2012-04-23 23:36:38 +02002246 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002247 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002248 endptr = (char *)startptr + end * kind;
2249 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002250 switch(kind) {
2251 case PyUnicode_1BYTE_KIND:
2252 return ucs1lib_find_max_char(startptr, endptr);
2253 case PyUnicode_2BYTE_KIND:
2254 return ucs2lib_find_max_char(startptr, endptr);
2255 case PyUnicode_4BYTE_KIND:
2256 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002257 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002258 assert(0);
2259 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002260 }
2261}
2262
Victor Stinner25a4b292011-10-06 12:31:55 +02002263/* Ensure that a string uses the most efficient storage, if it is not the
2264 case: create a new string with of the right kind. Write NULL into *p_unicode
2265 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002266static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002267unicode_adjust_maxchar(PyObject **p_unicode)
2268{
2269 PyObject *unicode, *copy;
2270 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002271 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002272 unsigned int kind;
2273
2274 assert(p_unicode != NULL);
2275 unicode = *p_unicode;
2276 assert(PyUnicode_IS_READY(unicode));
2277 if (PyUnicode_IS_ASCII(unicode))
2278 return;
2279
2280 len = PyUnicode_GET_LENGTH(unicode);
2281 kind = PyUnicode_KIND(unicode);
2282 if (kind == PyUnicode_1BYTE_KIND) {
2283 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002284 max_char = ucs1lib_find_max_char(u, u + len);
2285 if (max_char >= 128)
2286 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002287 }
2288 else if (kind == PyUnicode_2BYTE_KIND) {
2289 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002290 max_char = ucs2lib_find_max_char(u, u + len);
2291 if (max_char >= 256)
2292 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002293 }
2294 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002295 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002296 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002297 max_char = ucs4lib_find_max_char(u, u + len);
2298 if (max_char >= 0x10000)
2299 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002300 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002301 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002302 if (copy != NULL)
2303 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002304 Py_DECREF(unicode);
2305 *p_unicode = copy;
2306}
2307
Victor Stinner034f6cf2011-09-30 02:26:44 +02002308PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002309_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002310{
Victor Stinner87af4f22011-11-21 23:03:47 +01002311 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002312 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002313
Victor Stinner034f6cf2011-09-30 02:26:44 +02002314 if (!PyUnicode_Check(unicode)) {
2315 PyErr_BadInternalCall();
2316 return NULL;
2317 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002318 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002319 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002320
Victor Stinner87af4f22011-11-21 23:03:47 +01002321 length = PyUnicode_GET_LENGTH(unicode);
2322 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002323 if (!copy)
2324 return NULL;
2325 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2326
Victor Stinner87af4f22011-11-21 23:03:47 +01002327 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2328 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002329 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002330 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002331}
2332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333
Victor Stinnerbc603d12011-10-02 01:00:40 +02002334/* Widen Unicode objects to larger buffers. Don't write terminating null
2335 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002336
2337void*
2338_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2339{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002340 Py_ssize_t len;
2341 void *result;
2342 unsigned int skind;
2343
Benjamin Petersonbac79492012-01-14 13:34:47 -05002344 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002345 return NULL;
2346
2347 len = PyUnicode_GET_LENGTH(s);
2348 skind = PyUnicode_KIND(s);
2349 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002350 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 return NULL;
2352 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002353 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002354 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002355 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002356 if (!result)
2357 return PyErr_NoMemory();
2358 assert(skind == PyUnicode_1BYTE_KIND);
2359 _PyUnicode_CONVERT_BYTES(
2360 Py_UCS1, Py_UCS2,
2361 PyUnicode_1BYTE_DATA(s),
2362 PyUnicode_1BYTE_DATA(s) + len,
2363 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002364 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002365 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002366 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002367 if (!result)
2368 return PyErr_NoMemory();
2369 if (skind == PyUnicode_2BYTE_KIND) {
2370 _PyUnicode_CONVERT_BYTES(
2371 Py_UCS2, Py_UCS4,
2372 PyUnicode_2BYTE_DATA(s),
2373 PyUnicode_2BYTE_DATA(s) + len,
2374 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002376 else {
2377 assert(skind == PyUnicode_1BYTE_KIND);
2378 _PyUnicode_CONVERT_BYTES(
2379 Py_UCS1, Py_UCS4,
2380 PyUnicode_1BYTE_DATA(s),
2381 PyUnicode_1BYTE_DATA(s) + len,
2382 result);
2383 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002385 default:
2386 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 }
Victor Stinner01698042011-10-04 00:04:26 +02002388 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 return NULL;
2390}
2391
2392static Py_UCS4*
2393as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2394 int copy_null)
2395{
2396 int kind;
2397 void *data;
2398 Py_ssize_t len, targetlen;
2399 if (PyUnicode_READY(string) == -1)
2400 return NULL;
2401 kind = PyUnicode_KIND(string);
2402 data = PyUnicode_DATA(string);
2403 len = PyUnicode_GET_LENGTH(string);
2404 targetlen = len;
2405 if (copy_null)
2406 targetlen++;
2407 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002408 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 if (!target) {
2410 PyErr_NoMemory();
2411 return NULL;
2412 }
2413 }
2414 else {
2415 if (targetsize < targetlen) {
2416 PyErr_Format(PyExc_SystemError,
2417 "string is longer than the buffer");
2418 if (copy_null && 0 < targetsize)
2419 target[0] = 0;
2420 return NULL;
2421 }
2422 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002423 if (kind == PyUnicode_1BYTE_KIND) {
2424 Py_UCS1 *start = (Py_UCS1 *) data;
2425 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002427 else if (kind == PyUnicode_2BYTE_KIND) {
2428 Py_UCS2 *start = (Py_UCS2 *) data;
2429 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2430 }
2431 else {
2432 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002434 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 if (copy_null)
2436 target[len] = 0;
2437 return target;
2438}
2439
2440Py_UCS4*
2441PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2442 int copy_null)
2443{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002444 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 PyErr_BadInternalCall();
2446 return NULL;
2447 }
2448 return as_ucs4(string, target, targetsize, copy_null);
2449}
2450
2451Py_UCS4*
2452PyUnicode_AsUCS4Copy(PyObject *string)
2453{
2454 return as_ucs4(string, NULL, 0, 1);
2455}
2456
2457#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002458
Alexander Belopolsky40018472011-02-26 01:02:56 +00002459PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002460PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002462 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002464 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 PyErr_BadInternalCall();
2466 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002467 }
2468
Martin v. Löwis790465f2008-04-05 20:41:37 +00002469 if (size == -1) {
2470 size = wcslen(w);
2471 }
2472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474}
2475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002477
/* Upper bound on the number of characters produced when formatting a
   long long (%lld) or a pointer (%p).
   A value of SIZEOF_LONG_LONG bytes needs at most
   ceil(log10(256) * SIZEOF_LONG_LONG) decimal digits, plus one character
   for the sign.  log10(256) is bounded from above by 53/22, and
   2 + (x - 1) / 22 is 1 + ceil(x / 22) in integer arithmetic. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002482
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002483static int
2484unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2485 Py_ssize_t width, Py_ssize_t precision)
2486{
2487 Py_ssize_t length, fill, arglen;
2488 Py_UCS4 maxchar;
2489
2490 if (PyUnicode_READY(str) == -1)
2491 return -1;
2492
2493 length = PyUnicode_GET_LENGTH(str);
2494 if ((precision == -1 || precision >= length)
2495 && width <= length)
2496 return _PyUnicodeWriter_WriteStr(writer, str);
2497
2498 if (precision != -1)
2499 length = Py_MIN(precision, length);
2500
2501 arglen = Py_MAX(length, width);
2502 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2503 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2504 else
2505 maxchar = writer->maxchar;
2506
2507 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2508 return -1;
2509
2510 if (width > length) {
2511 fill = width - length;
2512 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2513 return -1;
2514 writer->pos += fill;
2515 }
2516
2517 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2518 str, 0, length);
2519 writer->pos += length;
2520 return 0;
2521}
2522
2523static int
2524unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2525 Py_ssize_t width, Py_ssize_t precision)
2526{
2527 /* UTF-8 */
2528 Py_ssize_t length;
2529 PyObject *unicode;
2530 int res;
2531
2532 length = strlen(str);
2533 if (precision != -1)
2534 length = Py_MIN(length, precision);
2535 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2536 if (unicode == NULL)
2537 return -1;
2538
2539 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2540 Py_DECREF(unicode);
2541 return res;
2542}
2543
Victor Stinner96865452011-03-01 23:44:09 +00002544static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002545unicode_fromformat_arg(_PyUnicodeWriter *writer,
2546 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002547{
Victor Stinnere215d962012-10-06 23:03:36 +02002548 const char *p;
2549 Py_ssize_t len;
2550 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002551 Py_ssize_t width;
2552 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002553 int longflag;
2554 int longlongflag;
2555 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002556 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002557
2558 p = f;
2559 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002560 zeropad = 0;
2561 if (*f == '0') {
2562 zeropad = 1;
2563 f++;
2564 }
Victor Stinner96865452011-03-01 23:44:09 +00002565
2566 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002567 width = -1;
2568 if (Py_ISDIGIT((unsigned)*f)) {
2569 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002570 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002571 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002572 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002573 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002575 return NULL;
2576 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002577 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002578 f++;
2579 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002580 }
2581 precision = -1;
2582 if (*f == '.') {
2583 f++;
2584 if (Py_ISDIGIT((unsigned)*f)) {
2585 precision = (*f - '0');
2586 f++;
2587 while (Py_ISDIGIT((unsigned)*f)) {
2588 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2589 PyErr_SetString(PyExc_ValueError,
2590 "precision too big");
2591 return NULL;
2592 }
2593 precision = (precision * 10) + (*f - '0');
2594 f++;
2595 }
2596 }
Victor Stinner96865452011-03-01 23:44:09 +00002597 if (*f == '%') {
2598 /* "%.3%s" => f points to "3" */
2599 f--;
2600 }
2601 }
2602 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002603 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002604 f--;
2605 }
Victor Stinner96865452011-03-01 23:44:09 +00002606
2607 /* Handle %ld, %lu, %lld and %llu. */
2608 longflag = 0;
2609 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002610 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002611 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002612 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002613 longflag = 1;
2614 ++f;
2615 }
2616#ifdef HAVE_LONG_LONG
2617 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002618 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002619 longlongflag = 1;
2620 f += 2;
2621 }
2622#endif
2623 }
2624 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002625 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002626 size_tflag = 1;
2627 ++f;
2628 }
Victor Stinnere215d962012-10-06 23:03:36 +02002629
2630 if (f[1] == '\0')
2631 writer->overallocate = 0;
2632
2633 switch (*f) {
2634 case 'c':
2635 {
2636 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002637 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002638 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002639 "character argument not in range(0x110000)");
2640 return NULL;
2641 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002642 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002643 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002644 break;
2645 }
2646
2647 case 'i':
2648 case 'd':
2649 case 'u':
2650 case 'x':
2651 {
2652 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002653 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002654 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002655
2656 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002657 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002658 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002659 va_arg(*vargs, unsigned long));
2660#ifdef HAVE_LONG_LONG
2661 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002662 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002663 va_arg(*vargs, unsigned PY_LONG_LONG));
2664#endif
2665 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002666 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002667 va_arg(*vargs, size_t));
2668 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002669 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002670 va_arg(*vargs, unsigned int));
2671 }
2672 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002673 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002674 }
2675 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002676 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002677 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002678 va_arg(*vargs, long));
2679#ifdef HAVE_LONG_LONG
2680 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002681 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002682 va_arg(*vargs, PY_LONG_LONG));
2683#endif
2684 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002685 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002686 va_arg(*vargs, Py_ssize_t));
2687 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002688 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002689 va_arg(*vargs, int));
2690 }
2691 assert(len >= 0);
2692
Victor Stinnere215d962012-10-06 23:03:36 +02002693 if (precision < len)
2694 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002695
2696 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002697 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2698 return NULL;
2699
Victor Stinnere215d962012-10-06 23:03:36 +02002700 if (width > precision) {
2701 Py_UCS4 fillchar;
2702 fill = width - precision;
2703 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002704 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2705 return NULL;
2706 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002707 }
Victor Stinner15a11362012-10-06 23:48:20 +02002708 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002709 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002710 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2711 return NULL;
2712 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002713 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002714
Victor Stinner4a587072013-11-19 12:54:53 +01002715 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2716 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002717 break;
2718 }
2719
2720 case 'p':
2721 {
2722 char number[MAX_LONG_LONG_CHARS];
2723
2724 len = sprintf(number, "%p", va_arg(*vargs, void*));
2725 assert(len >= 0);
2726
2727 /* %p is ill-defined: ensure leading 0x. */
2728 if (number[1] == 'X')
2729 number[1] = 'x';
2730 else if (number[1] != 'x') {
2731 memmove(number + 2, number,
2732 strlen(number) + 1);
2733 number[0] = '0';
2734 number[1] = 'x';
2735 len += 2;
2736 }
2737
Victor Stinner4a587072013-11-19 12:54:53 +01002738 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002739 return NULL;
2740 break;
2741 }
2742
2743 case 's':
2744 {
2745 /* UTF-8 */
2746 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002748 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002749 break;
2750 }
2751
2752 case 'U':
2753 {
2754 PyObject *obj = va_arg(*vargs, PyObject *);
2755 assert(obj && _PyUnicode_CHECK(obj));
2756
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002757 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002758 return NULL;
2759 break;
2760 }
2761
2762 case 'V':
2763 {
2764 PyObject *obj = va_arg(*vargs, PyObject *);
2765 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002766 if (obj) {
2767 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002768 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002769 return NULL;
2770 }
2771 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002772 assert(str != NULL);
2773 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002774 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002775 }
2776 break;
2777 }
2778
2779 case 'S':
2780 {
2781 PyObject *obj = va_arg(*vargs, PyObject *);
2782 PyObject *str;
2783 assert(obj);
2784 str = PyObject_Str(obj);
2785 if (!str)
2786 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002787 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002788 Py_DECREF(str);
2789 return NULL;
2790 }
2791 Py_DECREF(str);
2792 break;
2793 }
2794
2795 case 'R':
2796 {
2797 PyObject *obj = va_arg(*vargs, PyObject *);
2798 PyObject *repr;
2799 assert(obj);
2800 repr = PyObject_Repr(obj);
2801 if (!repr)
2802 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002803 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002804 Py_DECREF(repr);
2805 return NULL;
2806 }
2807 Py_DECREF(repr);
2808 break;
2809 }
2810
2811 case 'A':
2812 {
2813 PyObject *obj = va_arg(*vargs, PyObject *);
2814 PyObject *ascii;
2815 assert(obj);
2816 ascii = PyObject_ASCII(obj);
2817 if (!ascii)
2818 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002819 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002820 Py_DECREF(ascii);
2821 return NULL;
2822 }
2823 Py_DECREF(ascii);
2824 break;
2825 }
2826
2827 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002828 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002829 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002830 break;
2831
2832 default:
2833 /* if we stumble upon an unknown formatting code, copy the rest
2834 of the format string to the output string. (we cannot just
2835 skip the code, since there's no way to know what's in the
2836 argument list) */
2837 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002838 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002839 return NULL;
2840 f = p+len;
2841 return f;
2842 }
2843
2844 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002845 return f;
2846}
2847
Walter Dörwaldd2034312007-05-18 16:29:38 +00002848PyObject *
2849PyUnicode_FromFormatV(const char *format, va_list vargs)
2850{
Victor Stinnere215d962012-10-06 23:03:36 +02002851 va_list vargs2;
2852 const char *f;
2853 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002854
Victor Stinner8f674cc2013-04-17 23:02:17 +02002855 _PyUnicodeWriter_Init(&writer);
2856 writer.min_length = strlen(format) + 100;
2857 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002858
2859 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2860 Copy it to be able to pass a reference to a subfunction. */
2861 Py_VA_COPY(vargs2, vargs);
2862
2863 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002864 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002865 f = unicode_fromformat_arg(&writer, f, &vargs2);
2866 if (f == NULL)
2867 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002869 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002870 const char *p;
2871 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002872
Victor Stinnere215d962012-10-06 23:03:36 +02002873 p = f;
2874 do
2875 {
2876 if ((unsigned char)*p > 127) {
2877 PyErr_Format(PyExc_ValueError,
2878 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2879 "string, got a non-ASCII byte: 0x%02x",
2880 (unsigned char)*p);
2881 return NULL;
2882 }
2883 p++;
2884 }
2885 while (*p != '\0' && *p != '%');
2886 len = p - f;
2887
2888 if (*p == '\0')
2889 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002890
2891 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002892 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002893
2894 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002895 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 }
Victor Stinnere215d962012-10-06 23:03:36 +02002897 return _PyUnicodeWriter_Finish(&writer);
2898
2899 fail:
2900 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002901 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002902}
2903
Walter Dörwaldd2034312007-05-18 16:29:38 +00002904PyObject *
2905PyUnicode_FromFormat(const char *format, ...)
2906{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002907 PyObject* ret;
2908 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002909
2910#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002911 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002912#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002913 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002914#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002915 ret = PyUnicode_FromFormatV(format, vargs);
2916 va_end(vargs);
2917 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002918}
2919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002920#ifdef HAVE_WCHAR_H
2921
Victor Stinner5593d8a2010-10-02 11:11:27 +00002922/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2923 convert a Unicode object to a wide character string.
2924
Victor Stinnerd88d9832011-09-06 02:00:05 +02002925 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002926 character) required to convert the unicode object. Ignore size argument.
2927
Victor Stinnerd88d9832011-09-06 02:00:05 +02002928 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002929 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002930 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002931static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002932unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002933 wchar_t *w,
2934 Py_ssize_t size)
2935{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002936 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002937 const wchar_t *wstr;
2938
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002939 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002940 if (wstr == NULL)
2941 return -1;
2942
Victor Stinner5593d8a2010-10-02 11:11:27 +00002943 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002944 if (size > res)
2945 size = res + 1;
2946 else
2947 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002948 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949 return res;
2950 }
2951 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002952 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002953}
2954
2955Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002956PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002957 wchar_t *w,
2958 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959{
2960 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002961 PyErr_BadInternalCall();
2962 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002964 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965}
2966
Victor Stinner137c34c2010-09-29 10:25:54 +00002967wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002968PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002969 Py_ssize_t *size)
2970{
2971 wchar_t* buffer;
2972 Py_ssize_t buflen;
2973
2974 if (unicode == NULL) {
2975 PyErr_BadInternalCall();
2976 return NULL;
2977 }
2978
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002979 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002980 if (buflen == -1)
2981 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002982 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002983 if (buffer == NULL) {
2984 PyErr_NoMemory();
2985 return NULL;
2986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002987 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002988 if (buflen == -1) {
2989 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002990 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002991 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002992 if (size != NULL)
2993 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002994 return buffer;
2995}
2996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002997#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998
Alexander Belopolsky40018472011-02-26 01:02:56 +00002999PyObject *
3000PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003001{
Victor Stinner8faf8212011-12-08 22:14:11 +01003002 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 PyErr_SetString(PyExc_ValueError,
3004 "chr() arg not in range(0x110000)");
3005 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003006 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003007
Victor Stinner985a82a2014-01-03 12:53:47 +01003008 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003009}
3010
Alexander Belopolsky40018472011-02-26 01:02:56 +00003011PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003012PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003014 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003016 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003017 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003018 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 Py_INCREF(obj);
3020 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003021 }
3022 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003023 /* For a Unicode subtype that's not a Unicode object,
3024 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003025 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003026 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003027 PyErr_Format(PyExc_TypeError,
3028 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003029 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003030 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003031}
3032
Alexander Belopolsky40018472011-02-26 01:02:56 +00003033PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003034PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003035 const char *encoding,
3036 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003037{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003038 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003039 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003040
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 PyErr_BadInternalCall();
3043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003045
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003046 /* Decoding bytes objects is the most common case and should be fast */
3047 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003048 if (PyBytes_GET_SIZE(obj) == 0)
3049 _Py_RETURN_UNICODE_EMPTY();
3050 v = PyUnicode_Decode(
3051 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3052 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003053 return v;
3054 }
3055
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003056 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 PyErr_SetString(PyExc_TypeError,
3058 "decoding str is not supported");
3059 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003060 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003061
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003062 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3063 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3064 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02003065 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003066 Py_TYPE(obj)->tp_name);
3067 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003068 }
Tim Petersced69f82003-09-16 20:30:58 +00003069
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003070 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003071 PyBuffer_Release(&buffer);
3072 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003074
Serhiy Storchaka05997252013-01-26 12:14:02 +02003075 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003076 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003077 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078}
3079
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   encoding:  name to normalize; NULL is treated as "utf-8".
   lower:     output buffer receiving the null-terminated normalized name.
   lower_len: size of `lower` in bytes, terminator included.

   Return 0 on error (the normalized name, including its terminator, does
   not fit in lower_len bytes), 1 on success.  On error the contents of
   `lower` are unspecified. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminator.  Bail out before
       computing lower_len - 1, which would wrap around for size_t and let
       the loop (or the final terminator store) write outside the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot, reserved for the null character. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3119
Alexander Belopolsky40018472011-02-26 01:02:56 +00003120PyObject *
3121PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003122 Py_ssize_t size,
3123 const char *encoding,
3124 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003125{
3126 PyObject *buffer = NULL, *unicode;
3127 Py_buffer info;
3128 char lower[11]; /* Enough for any encoding shortcut */
3129
Fred Drakee4315f52000-05-09 19:53:39 +00003130 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003131 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003132 if ((strcmp(lower, "utf-8") == 0) ||
3133 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003134 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003135 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003136 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003137 (strcmp(lower, "iso-8859-1") == 0) ||
3138 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003139 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003140#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003141 else if (strcmp(lower, "mbcs") == 0)
3142 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003143#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003144 else if (strcmp(lower, "ascii") == 0)
3145 return PyUnicode_DecodeASCII(s, size, errors);
3146 else if (strcmp(lower, "utf-16") == 0)
3147 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3148 else if (strcmp(lower, "utf-32") == 0)
3149 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151
3152 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003153 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003154 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003155 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003156 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 if (buffer == NULL)
3158 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003159 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 if (unicode == NULL)
3161 goto onError;
3162 if (!PyUnicode_Check(unicode)) {
3163 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003164 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3165 "use codecs.decode() to decode to arbitrary types",
3166 encoding,
3167 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168 Py_DECREF(unicode);
3169 goto onError;
3170 }
3171 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003172 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003173
Benjamin Peterson29060642009-01-31 22:14:21 +00003174 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175 Py_XDECREF(buffer);
3176 return NULL;
3177}
3178
Alexander Belopolsky40018472011-02-26 01:02:56 +00003179PyObject *
3180PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003181 const char *encoding,
3182 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003183{
3184 PyObject *v;
3185
3186 if (!PyUnicode_Check(unicode)) {
3187 PyErr_BadArgument();
3188 goto onError;
3189 }
3190
3191 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003192 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003193
3194 /* Decode via the codec registry */
3195 v = PyCodec_Decode(unicode, encoding, errors);
3196 if (v == NULL)
3197 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003198 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003199
Benjamin Peterson29060642009-01-31 22:14:21 +00003200 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003201 return NULL;
3202}
3203
Alexander Belopolsky40018472011-02-26 01:02:56 +00003204PyObject *
3205PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003206 const char *encoding,
3207 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003208{
3209 PyObject *v;
3210
3211 if (!PyUnicode_Check(unicode)) {
3212 PyErr_BadArgument();
3213 goto onError;
3214 }
3215
3216 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003218
3219 /* Decode via the codec registry */
3220 v = PyCodec_Decode(unicode, encoding, errors);
3221 if (v == NULL)
3222 goto onError;
3223 if (!PyUnicode_Check(v)) {
3224 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003225 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3226 "use codecs.decode() to decode to arbitrary types",
3227 encoding,
3228 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003229 Py_DECREF(v);
3230 goto onError;
3231 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003232 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003233
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003235 return NULL;
3236}
3237
Alexander Belopolsky40018472011-02-26 01:02:56 +00003238PyObject *
3239PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003240 Py_ssize_t size,
3241 const char *encoding,
3242 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243{
3244 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003245
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 unicode = PyUnicode_FromUnicode(s, size);
3247 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3250 Py_DECREF(unicode);
3251 return v;
3252}
3253
Alexander Belopolsky40018472011-02-26 01:02:56 +00003254PyObject *
3255PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003256 const char *encoding,
3257 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003258{
3259 PyObject *v;
3260
3261 if (!PyUnicode_Check(unicode)) {
3262 PyErr_BadArgument();
3263 goto onError;
3264 }
3265
3266 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003268
3269 /* Encode via the codec registry */
3270 v = PyCodec_Encode(unicode, encoding, errors);
3271 if (v == NULL)
3272 goto onError;
3273 return v;
3274
Benjamin Peterson29060642009-01-31 22:14:21 +00003275 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003276 return NULL;
3277}
3278
/* Locate the first wide character of `wstr` that wcstombs() refuses to
   convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Returns the index, in
   wchar_t units, of the first chunk that fails, or 0 when every chunk
   converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];           /* surrogate pair + terminator */
#else
    wchar_t chunk[2];           /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;

    /* The terminator slot never changes, so set it once up front. */
#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif

    for (cursor = wstr; *cursor != L'\0'; ) {
        const wchar_t *chunk_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return chunk_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3325
Victor Stinner1b579672011-12-17 05:47:23 +01003326static int
3327locale_error_handler(const char *errors, int *surrogateescape)
3328{
Victor Stinner50149202015-09-22 00:26:54 +02003329 _Py_error_handler error_handler = get_error_handler(errors);
3330 switch (error_handler)
3331 {
3332 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003333 *surrogateescape = 0;
3334 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003335 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003336 *surrogateescape = 1;
3337 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003338 default:
3339 PyErr_Format(PyExc_ValueError,
3340 "only 'strict' and 'surrogateescape' error handlers "
3341 "are supported, not '%s'",
3342 errors);
3343 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003344 }
Victor Stinner1b579672011-12-17 05:47:23 +01003345}
3346
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003347PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003348PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003349{
3350 Py_ssize_t wlen, wlen2;
3351 wchar_t *wstr;
3352 PyObject *bytes = NULL;
3353 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003354 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003355 PyObject *exc;
3356 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003357 int surrogateescape;
3358
3359 if (locale_error_handler(errors, &surrogateescape) < 0)
3360 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003361
3362 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3363 if (wstr == NULL)
3364 return NULL;
3365
3366 wlen2 = wcslen(wstr);
3367 if (wlen2 != wlen) {
3368 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003369 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003370 return NULL;
3371 }
3372
3373 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003374 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003375 char *str;
3376
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003377 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003378 if (str == NULL) {
3379 if (error_pos == (size_t)-1) {
3380 PyErr_NoMemory();
3381 PyMem_Free(wstr);
3382 return NULL;
3383 }
3384 else {
3385 goto encode_error;
3386 }
3387 }
3388 PyMem_Free(wstr);
3389
3390 bytes = PyBytes_FromString(str);
3391 PyMem_Free(str);
3392 }
3393 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003394 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003395 size_t len, len2;
3396
3397 len = wcstombs(NULL, wstr, 0);
3398 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003399 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003400 goto encode_error;
3401 }
3402
3403 bytes = PyBytes_FromStringAndSize(NULL, len);
3404 if (bytes == NULL) {
3405 PyMem_Free(wstr);
3406 return NULL;
3407 }
3408
3409 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3410 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003411 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003412 goto encode_error;
3413 }
3414 PyMem_Free(wstr);
3415 }
3416 return bytes;
3417
3418encode_error:
3419 errmsg = strerror(errno);
3420 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003421
3422 if (error_pos == (size_t)-1)
3423 error_pos = wcstombs_errorpos(wstr);
3424
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003425 PyMem_Free(wstr);
3426 Py_XDECREF(bytes);
3427
Victor Stinner2f197072011-12-17 07:08:30 +01003428 if (errmsg != NULL) {
3429 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003430 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003431 if (wstr != NULL) {
3432 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003433 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003434 } else
3435 errmsg = NULL;
3436 }
3437 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003438 reason = PyUnicode_FromString(
3439 "wcstombs() encountered an unencodable "
3440 "wide character");
3441 if (reason == NULL)
3442 return NULL;
3443
3444 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3445 "locale", unicode,
3446 (Py_ssize_t)error_pos,
3447 (Py_ssize_t)(error_pos+1),
3448 reason);
3449 Py_DECREF(reason);
3450 if (exc != NULL) {
3451 PyCodec_StrictErrors(exc);
3452 Py_XDECREF(exc);
3453 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003454 return NULL;
3455}
3456
Victor Stinnerad158722010-10-27 00:25:46 +00003457PyObject *
3458PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003459{
Victor Stinner99b95382011-07-04 14:23:54 +02003460#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003461 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003462#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003463 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003464#else
Victor Stinner793b5312011-04-27 00:24:21 +02003465 PyInterpreterState *interp = PyThreadState_GET()->interp;
3466 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3467 cannot use it to encode and decode filenames before it is loaded. Load
3468 the Python codec requires to encode at least its own filename. Use the C
3469 version of the locale codec until the codec registry is initialized and
3470 the Python codec is loaded.
3471
3472 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3473 cannot only rely on it: check also interp->fscodec_initialized for
3474 subinterpreters. */
3475 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003476 return PyUnicode_AsEncodedString(unicode,
3477 Py_FileSystemDefaultEncoding,
3478 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003479 }
3480 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003481 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003482 }
Victor Stinnerad158722010-10-27 00:25:46 +00003483#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003484}
3485
Alexander Belopolsky40018472011-02-26 01:02:56 +00003486PyObject *
3487PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003488 const char *encoding,
3489 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490{
3491 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003492 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003493
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 if (!PyUnicode_Check(unicode)) {
3495 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003496 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 }
Fred Drakee4315f52000-05-09 19:53:39 +00003498
Fred Drakee4315f52000-05-09 19:53:39 +00003499 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003500 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003501 if ((strcmp(lower, "utf-8") == 0) ||
3502 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003503 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003504 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003505 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003506 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003507 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003508 }
Victor Stinner37296e82010-06-10 13:36:23 +00003509 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003510 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003511 (strcmp(lower, "iso-8859-1") == 0) ||
3512 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003513 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003514#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003515 else if (strcmp(lower, "mbcs") == 0)
3516 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003517#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003518 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003519 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521
3522 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003523 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003524 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003525 return NULL;
3526
3527 /* The normal path */
3528 if (PyBytes_Check(v))
3529 return v;
3530
3531 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003532 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003533 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003534 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003535
3536 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003537 "encoder %s returned bytearray instead of bytes; "
3538 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003539 encoding);
3540 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003541 Py_DECREF(v);
3542 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003543 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003544
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003545 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3546 Py_DECREF(v);
3547 return b;
3548 }
3549
3550 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003551 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3552 "use codecs.encode() to encode to arbitrary types",
3553 encoding,
3554 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003555 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003556 return NULL;
3557}
3558
Alexander Belopolsky40018472011-02-26 01:02:56 +00003559PyObject *
3560PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003561 const char *encoding,
3562 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003563{
3564 PyObject *v;
3565
3566 if (!PyUnicode_Check(unicode)) {
3567 PyErr_BadArgument();
3568 goto onError;
3569 }
3570
3571 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003572 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003573
3574 /* Encode via the codec registry */
3575 v = PyCodec_Encode(unicode, encoding, errors);
3576 if (v == NULL)
3577 goto onError;
3578 if (!PyUnicode_Check(v)) {
3579 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003580 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3581 "use codecs.encode() to encode to arbitrary types",
3582 encoding,
3583 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003584 Py_DECREF(v);
3585 goto onError;
3586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003588
Benjamin Peterson29060642009-01-31 22:14:21 +00003589 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 return NULL;
3591}
3592
/* Locate the first byte sequence of `str` that mbrtowc() rejects.

   Walks at most `len` bytes of `str` with mbrtowc() and returns the byte
   offset at which it reports a conversion error or an incomplete
   character.  Returns 0 when no such sequence is found, when a null
   character is reached first, or when mbrtowc() is unavailable
   (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    memset(&mbs, 0, sizeof mbs);
    while (len) {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0) {
            /* Reached end of string */
            break;
        }
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No mbrtowc(): the position cannot be determined. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3623
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003624PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003625PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003626 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003627{
3628 wchar_t smallbuf[256];
3629 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3630 wchar_t *wstr;
3631 size_t wlen, wlen2;
3632 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003633 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003634 size_t error_pos;
3635 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003636 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3637 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003638
3639 if (locale_error_handler(errors, &surrogateescape) < 0)
3640 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003641
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003642 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3643 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003644 return NULL;
3645 }
3646
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003647 if (surrogateescape) {
3648 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003649 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003650 if (wstr == NULL) {
3651 if (wlen == (size_t)-1)
3652 PyErr_NoMemory();
3653 else
3654 PyErr_SetFromErrno(PyExc_OSError);
3655 return NULL;
3656 }
3657
3658 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003659 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003660 }
3661 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003662 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003663#ifndef HAVE_BROKEN_MBSTOWCS
3664 wlen = mbstowcs(NULL, str, 0);
3665#else
3666 wlen = len;
3667#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003668 if (wlen == (size_t)-1)
3669 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003670 if (wlen+1 <= smallbuf_len) {
3671 wstr = smallbuf;
3672 }
3673 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003674 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003675 if (!wstr)
3676 return PyErr_NoMemory();
3677 }
3678
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003679 wlen2 = mbstowcs(wstr, str, wlen+1);
3680 if (wlen2 == (size_t)-1) {
3681 if (wstr != smallbuf)
3682 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003683 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003684 }
3685#ifdef HAVE_BROKEN_MBSTOWCS
3686 assert(wlen2 == wlen);
3687#endif
3688 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3689 if (wstr != smallbuf)
3690 PyMem_Free(wstr);
3691 }
3692 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003693
3694decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003695 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003696 errmsg = strerror(errno);
3697 assert(errmsg != NULL);
3698
3699 error_pos = mbstowcs_errorpos(str, len);
3700 if (errmsg != NULL) {
3701 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003702 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003703 if (wstr != NULL) {
3704 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003705 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003706 }
Victor Stinner2f197072011-12-17 07:08:30 +01003707 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003708 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003709 reason = PyUnicode_FromString(
3710 "mbstowcs() encountered an invalid multibyte sequence");
3711 if (reason == NULL)
3712 return NULL;
3713
3714 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3715 "locale", str, len,
3716 (Py_ssize_t)error_pos,
3717 (Py_ssize_t)(error_pos+1),
3718 reason);
3719 Py_DECREF(reason);
3720 if (exc != NULL) {
3721 PyCodec_StrictErrors(exc);
3722 Py_XDECREF(exc);
3723 }
3724 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003725}
3726
3727PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003728PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003729{
3730 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003731 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003732}
3733
3734
3735PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003736PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003737 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003738 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3739}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003740
Christian Heimes5894ba72007-11-04 11:43:14 +00003741PyObject*
3742PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3743{
Victor Stinner99b95382011-07-04 14:23:54 +02003744#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003745 return PyUnicode_DecodeMBCS(s, size, NULL);
3746#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003747 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003748#else
Victor Stinner793b5312011-04-27 00:24:21 +02003749 PyInterpreterState *interp = PyThreadState_GET()->interp;
3750 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3751 cannot use it to encode and decode filenames before it is loaded. Load
3752 the Python codec requires to encode at least its own filename. Use the C
3753 version of the locale codec until the codec registry is initialized and
3754 the Python codec is loaded.
3755
3756 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3757 cannot only rely on it: check also interp->fscodec_initialized for
3758 subinterpreters. */
3759 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003760 return PyUnicode_Decode(s, size,
3761 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003762 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003763 }
3764 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003765 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003766 }
Victor Stinnerad158722010-10-27 00:25:46 +00003767#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003768}
3769
Martin v. Löwis011e8422009-05-05 04:43:17 +00003770
3771int
3772PyUnicode_FSConverter(PyObject* arg, void* addr)
3773{
3774 PyObject *output = NULL;
3775 Py_ssize_t size;
3776 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003777 if (arg == NULL) {
3778 Py_DECREF(*(PyObject**)addr);
3779 return 1;
3780 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003781 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003782 output = arg;
3783 Py_INCREF(output);
3784 }
3785 else {
3786 arg = PyUnicode_FromObject(arg);
3787 if (!arg)
3788 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003789 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003790 Py_DECREF(arg);
3791 if (!output)
3792 return 0;
3793 if (!PyBytes_Check(output)) {
3794 Py_DECREF(output);
3795 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3796 return 0;
3797 }
3798 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003799 size = PyBytes_GET_SIZE(output);
3800 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003801 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003802 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003803 Py_DECREF(output);
3804 return 0;
3805 }
3806 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003807 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003808}
3809
3810
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003811int
3812PyUnicode_FSDecoder(PyObject* arg, void* addr)
3813{
3814 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003815 if (arg == NULL) {
3816 Py_DECREF(*(PyObject**)addr);
3817 return 1;
3818 }
3819 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003820 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003822 output = arg;
3823 Py_INCREF(output);
3824 }
3825 else {
3826 arg = PyBytes_FromObject(arg);
3827 if (!arg)
3828 return 0;
3829 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3830 PyBytes_GET_SIZE(arg));
3831 Py_DECREF(arg);
3832 if (!output)
3833 return 0;
3834 if (!PyUnicode_Check(output)) {
3835 Py_DECREF(output);
3836 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3837 return 0;
3838 }
3839 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003840 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003841 Py_DECREF(output);
3842 return 0;
3843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003845 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003846 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003847 Py_DECREF(output);
3848 return 0;
3849 }
3850 *(PyObject**)addr = output;
3851 return Py_CLEANUP_SUPPORTED;
3852}
3853
3854
Martin v. Löwis5b222132007-06-10 09:51:05 +00003855char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003856PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003857{
Christian Heimesf3863112007-11-22 07:46:41 +00003858 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003860 if (!PyUnicode_Check(unicode)) {
3861 PyErr_BadArgument();
3862 return NULL;
3863 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003864 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003865 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003867 if (PyUnicode_UTF8(unicode) == NULL) {
3868 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3870 if (bytes == NULL)
3871 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003872 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3873 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003874 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 Py_DECREF(bytes);
3876 return NULL;
3877 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3879 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3880 PyBytes_AS_STRING(bytes),
3881 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003882 Py_DECREF(bytes);
3883 }
3884
3885 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003886 *psize = PyUnicode_UTF8_LENGTH(unicode);
3887 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003888}
3889
3890char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3894}
3895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003896Py_UNICODE *
3897PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3898{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899 const unsigned char *one_byte;
3900#if SIZEOF_WCHAR_T == 4
3901 const Py_UCS2 *two_bytes;
3902#else
3903 const Py_UCS4 *four_bytes;
3904 const Py_UCS4 *ucs4_end;
3905 Py_ssize_t num_surrogates;
3906#endif
3907 wchar_t *w;
3908 wchar_t *wchar_end;
3909
3910 if (!PyUnicode_Check(unicode)) {
3911 PyErr_BadArgument();
3912 return NULL;
3913 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003914 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003916 assert(_PyUnicode_KIND(unicode) != 0);
3917 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003919 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003921 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3922 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 num_surrogates = 0;
3924
3925 for (; four_bytes < ucs4_end; ++four_bytes) {
3926 if (*four_bytes > 0xFFFF)
3927 ++num_surrogates;
3928 }
3929
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003930 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3931 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3932 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 PyErr_NoMemory();
3934 return NULL;
3935 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003936 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003937
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003938 w = _PyUnicode_WSTR(unicode);
3939 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3940 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3942 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003943 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003945 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3946 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 }
3948 else
3949 *w = *four_bytes;
3950
3951 if (w > wchar_end) {
3952 assert(0 && "Miscalculated string end");
3953 }
3954 }
3955 *w = 0;
3956#else
3957 /* sizeof(wchar_t) == 4 */
3958 Py_FatalError("Impossible unicode object state, wstr and str "
3959 "should share memory already.");
3960 return NULL;
3961#endif
3962 }
3963 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003964 if ((size_t)_PyUnicode_LENGTH(unicode) >
3965 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3966 PyErr_NoMemory();
3967 return NULL;
3968 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003969 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3970 (_PyUnicode_LENGTH(unicode) + 1));
3971 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972 PyErr_NoMemory();
3973 return NULL;
3974 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003975 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3976 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3977 w = _PyUnicode_WSTR(unicode);
3978 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003980 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3981 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982 for (; w < wchar_end; ++one_byte, ++w)
3983 *w = *one_byte;
3984 /* null-terminate the wstr */
3985 *w = 0;
3986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003987 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003988#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003989 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003990 for (; w < wchar_end; ++two_bytes, ++w)
3991 *w = *two_bytes;
3992 /* null-terminate the wstr */
3993 *w = 0;
3994#else
3995 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003996 PyObject_FREE(_PyUnicode_WSTR(unicode));
3997 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998 Py_FatalError("Impossible unicode object state, wstr "
3999 "and str should share memory already.");
4000 return NULL;
4001#endif
4002 }
4003 else {
4004 assert(0 && "This should never happen.");
4005 }
4006 }
4007 }
4008 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004009 *size = PyUnicode_WSTR_LENGTH(unicode);
4010 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004011}
4012
Alexander Belopolsky40018472011-02-26 01:02:56 +00004013Py_UNICODE *
4014PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017}
4018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019
Alexander Belopolsky40018472011-02-26 01:02:56 +00004020Py_ssize_t
4021PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022{
4023 if (!PyUnicode_Check(unicode)) {
4024 PyErr_BadArgument();
4025 goto onError;
4026 }
4027 return PyUnicode_GET_SIZE(unicode);
4028
Benjamin Peterson29060642009-01-31 22:14:21 +00004029 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030 return -1;
4031}
4032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033Py_ssize_t
4034PyUnicode_GetLength(PyObject *unicode)
4035{
Victor Stinner07621332012-06-16 04:53:46 +02004036 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 PyErr_BadArgument();
4038 return -1;
4039 }
Victor Stinner07621332012-06-16 04:53:46 +02004040 if (PyUnicode_READY(unicode) == -1)
4041 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 return PyUnicode_GET_LENGTH(unicode);
4043}
4044
4045Py_UCS4
4046PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4047{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004048 void *data;
4049 int kind;
4050
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004051 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4052 PyErr_BadArgument();
4053 return (Py_UCS4)-1;
4054 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004055 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004056 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 return (Py_UCS4)-1;
4058 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004059 data = PyUnicode_DATA(unicode);
4060 kind = PyUnicode_KIND(unicode);
4061 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062}
4063
4064int
4065PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4066{
4067 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004068 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 return -1;
4070 }
Victor Stinner488fa492011-12-12 00:01:39 +01004071 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004072 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004073 PyErr_SetString(PyExc_IndexError, "string index out of range");
4074 return -1;
4075 }
Victor Stinner488fa492011-12-12 00:01:39 +01004076 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004077 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004078 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4079 PyErr_SetString(PyExc_ValueError, "character out of range");
4080 return -1;
4081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4083 index, ch);
4084 return 0;
4085}
4086
/* Return the name of the default encoding, always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4092
Victor Stinner554f3f02010-06-16 23:33:54 +00004093/* create or adjust a UnicodeDecodeError */
4094static void
4095make_decode_exception(PyObject **exceptionObject,
4096 const char *encoding,
4097 const char *input, Py_ssize_t length,
4098 Py_ssize_t startpos, Py_ssize_t endpos,
4099 const char *reason)
4100{
4101 if (*exceptionObject == NULL) {
4102 *exceptionObject = PyUnicodeDecodeError_Create(
4103 encoding, input, length, startpos, endpos, reason);
4104 }
4105 else {
4106 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4107 goto onError;
4108 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4109 goto onError;
4110 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4111 goto onError;
4112 }
4113 return;
4114
4115onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004116 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004117}
4118
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   The error handler is looked up from `errors` on first use and kept in
   *errorHandler; the exception object is created or updated in
   *exceptionObject.  On success *input and *inend are reloaded from the
   exception's object attribute, *endinpos and *inptr are moved to the
   position returned by the handler, the replacement is written at *outpos
   in the wchar_t buffer of *output (resized if needed) and *outpos is
   advanced.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" prefix and leaves the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Do not go on and read a non-bytes object as bytes. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow geometrically when doubling is enough and cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  wcsncpy() would stop at the
       first null character in the replacement and zero-fill the rest. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4234
4235static int
4236unicode_decode_call_errorhandler_writer(
4237 const char *errors, PyObject **errorHandler,
4238 const char *encoding, const char *reason,
4239 const char **input, const char **inend, Py_ssize_t *startinpos,
4240 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4241 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4242{
4243 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4244
4245 PyObject *restuple = NULL;
4246 PyObject *repunicode = NULL;
4247 Py_ssize_t insize;
4248 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004249 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004250 PyObject *inputobj = NULL;
4251
4252 if (*errorHandler == NULL) {
4253 *errorHandler = PyCodec_LookupError(errors);
4254 if (*errorHandler == NULL)
4255 goto onError;
4256 }
4257
4258 make_decode_exception(exceptionObject,
4259 encoding,
4260 *input, *inend - *input,
4261 *startinpos, *endinpos,
4262 reason);
4263 if (*exceptionObject == NULL)
4264 goto onError;
4265
4266 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4267 if (restuple == NULL)
4268 goto onError;
4269 if (!PyTuple_Check(restuple)) {
4270 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4271 goto onError;
4272 }
4273 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004274 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004275
4276 /* Copy back the bytes variables, which might have been modified by the
4277 callback */
4278 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4279 if (!inputobj)
4280 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004281 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004283 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004284 *input = PyBytes_AS_STRING(inputobj);
4285 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004286 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004287 /* we can DECREF safely, as the exception has another reference,
4288 so the object won't go away. */
4289 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004292 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004293 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004294 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004295 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004296 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004297
Victor Stinner8f674cc2013-04-17 23:02:17 +02004298 if (PyUnicode_READY(repunicode) < 0)
4299 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004300 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004301 if (replen > 1) {
4302 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004303 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004304 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4305 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4306 goto onError;
4307 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004308 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004309 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004310
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004311 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004312 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004315 Py_XDECREF(restuple);
4316 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004320 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321}
4322
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323/* --- UTF-7 Codec -------------------------------------------------------- */
4324
Antoine Pitrou244651a2009-05-04 18:56:13 +00004325/* See RFC2152 for details. We encode conservatively and decode liberally. */
4326
4327/* Three simple macros defining base-64. */
4328
/* True when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+' or '/'). */

#define IS_BASE64(c)                    \
    (('A' <= (c) && (c) <= 'Z') ||      \
     ('a' <= (c) && (c) <= 'z') ||      \
     ('0' <= (c) && (c) <= '9') ||      \
     '+' == (c) || '/' == (c))
4336
/* Map a base-64 character to its 6-bit value.  The result is only
   meaningful when IS_BASE64(c) holds: anything that is not a letter,
   digit or '+' is treated as '/' (63). */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('+' == (c)) ? 62 : 63)
4344
/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n)                    \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
      "abcdefghijklmnopqrstuvwxyz"      \
      "0123456789+/")[(n) & 0x3f])
4349
/* DECODE_DIRECT: does this byte, found in a UTF-7 string outside a shift
 * sequence, decode as itself?  Decoding is permissive: every ASCII byte
 * does, except '+', which opens a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4357
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : code point to classify; only 1..127 can ever be direct.
 * directO  : non-zero if "Set O" characters may be written directly.
 * directWS : non-zero if whitespace may be written directly.
 *
 * The flag arguments are parenthesized in the expansion so that compound
 * expressions such as `a || b` keep their meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004403
Alexander Belopolsky40018472011-02-26 01:02:56 +00004404PyObject *
4405PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004406 Py_ssize_t size,
4407 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004408{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004409 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4410}
4411
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412/* The decoder. The only state we preserve is our read position,
4413 * i.e. how many characters we have consumed. So if we end in the
4414 * middle of a shift sequence we have to back off the read position
4415 * and the output to the beginning of the sequence, otherwise we lose
4416 * all the shift state (seen bits, number of bits seen, high
4417 * surrogate). */
4418
Alexander Belopolsky40018472011-02-26 01:02:56 +00004419PyObject *
4420PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004421 Py_ssize_t size,
4422 const char *errors,
4423 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004424{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004426 Py_ssize_t startinpos;
4427 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004428 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004429 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004430 const char *errmsg = "";
4431 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004432 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 unsigned int base64bits = 0;
4434 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004435 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 PyObject *errorHandler = NULL;
4437 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004439 if (size == 0) {
4440 if (consumed)
4441 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004442 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004443 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004445 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004446 _PyUnicodeWriter_Init(&writer);
4447 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448
4449 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004450 e = s + size;
4451
4452 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004453 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004454 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004455 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004456
Antoine Pitrou244651a2009-05-04 18:56:13 +00004457 if (inShift) { /* in a base-64 section */
4458 if (IS_BASE64(ch)) { /* consume a base-64 character */
4459 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4460 base64bits += 6;
4461 s++;
4462 if (base64bits >= 16) {
4463 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004464 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004465 base64bits -= 16;
4466 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004467 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004468 if (surrogate) {
4469 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004470 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4471 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004472 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004473 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004474 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004475 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004476 }
4477 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004478 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004479 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004480 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004481 }
4482 }
Victor Stinner551ac952011-11-29 22:58:13 +01004483 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004484 /* first surrogate */
4485 surrogate = outCh;
4486 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004487 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004488 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004489 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004490 }
4491 }
4492 }
4493 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004494 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004495 if (base64bits > 0) { /* left-over bits */
4496 if (base64bits >= 6) {
4497 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004498 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 errmsg = "partial character in shift sequence";
4500 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004501 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 else {
4503 /* Some bits remain; they should be zero */
4504 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004505 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004506 errmsg = "non-zero padding bits in shift sequence";
4507 goto utf7Error;
4508 }
4509 }
4510 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004511 if (surrogate && DECODE_DIRECT(ch)) {
4512 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4513 goto onError;
4514 }
4515 surrogate = 0;
4516 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004517 /* '-' is absorbed; other terminating
4518 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004519 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004520 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521 }
4522 }
4523 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525 s++; /* consume '+' */
4526 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004527 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004528 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004529 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004530 }
4531 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004533 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004534 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004536 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004537 }
4538 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004540 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004541 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004542 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004543 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004544 else {
4545 startinpos = s-starts;
4546 s++;
4547 errmsg = "unexpected special character";
4548 goto utf7Error;
4549 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004553 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004554 errors, &errorHandler,
4555 "utf7", errmsg,
4556 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004557 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004559 }
4560
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 /* end of string */
4562
4563 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4564 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004565 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 if (surrogate ||
4567 (base64bits >= 6) ||
4568 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004570 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004571 errors, &errorHandler,
4572 "utf7", "unterminated shift sequence",
4573 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004574 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004575 goto onError;
4576 if (s < e)
4577 goto restart;
4578 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580
4581 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004582 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004584 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004585 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004586 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004587 writer.kind, writer.data, shiftOutStart);
4588 Py_XDECREF(errorHandler);
4589 Py_XDECREF(exc);
4590 _PyUnicodeWriter_Dealloc(&writer);
4591 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004592 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004593 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004594 }
4595 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004596 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004598 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004599
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 Py_XDECREF(errorHandler);
4601 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004602 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004603
Benjamin Peterson29060642009-01-31 22:14:21 +00004604 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004605 Py_XDECREF(errorHandler);
4606 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004607 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004608 return NULL;
4609}
4610
4611
Alexander Belopolsky40018472011-02-26 01:02:56 +00004612PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004613_PyUnicode_EncodeUTF7(PyObject *str,
4614 int base64SetO,
4615 int base64WhiteSpace,
4616 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004617{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004618 int kind;
4619 void *data;
4620 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004621 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004622 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004623 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 unsigned int base64bits = 0;
4625 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 char * out;
4627 char * start;
4628
Benjamin Petersonbac79492012-01-14 13:34:47 -05004629 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004630 return NULL;
4631 kind = PyUnicode_KIND(str);
4632 data = PyUnicode_DATA(str);
4633 len = PyUnicode_GET_LENGTH(str);
4634
4635 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004638 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004639 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004640 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004641 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642 if (v == NULL)
4643 return NULL;
4644
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004645 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004646 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004647 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648
Antoine Pitrou244651a2009-05-04 18:56:13 +00004649 if (inShift) {
4650 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4651 /* shifting out */
4652 if (base64bits) { /* output remaining bits */
4653 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4654 base64buffer = 0;
4655 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656 }
4657 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004658 /* Characters not in the BASE64 set implicitly unshift the sequence
4659 so no '-' is required, except if the character is itself a '-' */
4660 if (IS_BASE64(ch) || ch == '-') {
4661 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004662 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 *out++ = (char) ch;
4664 }
4665 else {
4666 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004667 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669 else { /* not in a shift sequence */
4670 if (ch == '+') {
4671 *out++ = '+';
4672 *out++ = '-';
4673 }
4674 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4675 *out++ = (char) ch;
4676 }
4677 else {
4678 *out++ = '+';
4679 inShift = 1;
4680 goto encode_char;
4681 }
4682 }
4683 continue;
4684encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004685 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004686 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004687
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 /* code first surrogate */
4689 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004690 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004691 while (base64bits >= 6) {
4692 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4693 base64bits -= 6;
4694 }
4695 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004696 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004698 base64bits += 16;
4699 base64buffer = (base64buffer << 16) | ch;
4700 while (base64bits >= 6) {
4701 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4702 base64bits -= 6;
4703 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004704 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004705 if (base64bits)
4706 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4707 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004708 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004709 if (_PyBytes_Resize(&v, out - start) < 0)
4710 return NULL;
4711 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004712}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004713PyObject *
4714PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4715 Py_ssize_t size,
4716 int base64SetO,
4717 int base64WhiteSpace,
4718 const char *errors)
4719{
4720 PyObject *result;
4721 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4722 if (tmp == NULL)
4723 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004724 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004725 base64WhiteSpace, errors);
4726 Py_DECREF(tmp);
4727 return result;
4728}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004729
Antoine Pitrou244651a2009-05-04 18:56:13 +00004730#undef IS_BASE64
4731#undef FROM_BASE64
4732#undef TO_BASE64
4733#undef DECODE_DIRECT
4734#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736/* --- UTF-8 Codec -------------------------------------------------------- */
4737
Alexander Belopolsky40018472011-02-26 01:02:56 +00004738PyObject *
4739PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004740 Py_ssize_t size,
4741 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742{
Walter Dörwald69652032004-09-07 20:24:22 +00004743 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4744}
4745
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004746#include "stringlib/asciilib.h"
4747#include "stringlib/codecs.h"
4748#include "stringlib/undef.h"
4749
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004750#include "stringlib/ucs1lib.h"
4751#include "stringlib/codecs.h"
4752#include "stringlib/undef.h"
4753
4754#include "stringlib/ucs2lib.h"
4755#include "stringlib/codecs.h"
4756#include "stringlib/undef.h"
4757
4758#include "stringlib/ucs4lib.h"
4759#include "stringlib/codecs.h"
4760#include "stringlib/undef.h"
4761
Antoine Pitrouab868312009-01-10 15:40:25 +00004762/* Mask to quickly check whether a C 'long' contains a
4763 non-ASCII, UTF8-encoded char. */
4764#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004765# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004766#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004767# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004768#else
4769# error C 'long' size should be either 4 or 8!
4770#endif
4771
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772static Py_ssize_t
4773ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004774{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004775 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004776 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004778 /*
4779 * Issue #17237: m68k is a bit different from most architectures in
4780 * that objects do not use "natural alignment" - for example, int and
4781 * long are only aligned at 2-byte boundaries. Therefore the assert()
4782 * won't work; also, tests have shown that skipping the "optimised
4783 * version" will even speed up m68k.
4784 */
4785#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004786#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004787 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4788 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004789 /* Fast path, see in STRINGLIB(utf8_decode) for
4790 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004791 /* Help allocation */
4792 const char *_p = p;
4793 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 while (_p < aligned_end) {
4795 unsigned long value = *(const unsigned long *) _p;
4796 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004797 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004798 *((unsigned long *)q) = value;
4799 _p += SIZEOF_LONG;
4800 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004801 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004802 p = _p;
4803 while (p < end) {
4804 if ((unsigned char)*p & 0x80)
4805 break;
4806 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004808 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004810#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004811#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004812 while (p < end) {
4813 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4814 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004815 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004816 /* Help allocation */
4817 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004818 while (_p < aligned_end) {
4819 unsigned long value = *(unsigned long *) _p;
4820 if (value & ASCII_CHAR_MASK)
4821 break;
4822 _p += SIZEOF_LONG;
4823 }
4824 p = _p;
4825 if (_p == end)
4826 break;
4827 }
4828 if ((unsigned char)*p & 0x80)
4829 break;
4830 ++p;
4831 }
4832 memcpy(dest, start, p - start);
4833 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834}
Antoine Pitrouab868312009-01-10 15:40:25 +00004835
Victor Stinner785938e2011-12-11 20:09:03 +01004836PyObject *
4837PyUnicode_DecodeUTF8Stateful(const char *s,
4838 Py_ssize_t size,
4839 const char *errors,
4840 Py_ssize_t *consumed)
4841{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004842 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004843 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004844 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004845
4846 Py_ssize_t startinpos;
4847 Py_ssize_t endinpos;
4848 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004849 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004850 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004851 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004852
4853 if (size == 0) {
4854 if (consumed)
4855 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004856 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004857 }
4858
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004859 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4860 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004861 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004862 *consumed = 1;
4863 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004864 }
4865
Victor Stinner8f674cc2013-04-17 23:02:17 +02004866 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004867 writer.min_length = size;
4868 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004869 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004870
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004871 writer.pos = ascii_decode(s, end, writer.data);
4872 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004873 while (s < end) {
4874 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004875 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004876
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004877 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004878 if (PyUnicode_IS_ASCII(writer.buffer))
4879 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004880 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004881 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004883 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884 } else {
4885 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004886 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 }
4888
4889 switch (ch) {
4890 case 0:
4891 if (s == end || consumed)
4892 goto End;
4893 errmsg = "unexpected end of data";
4894 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004895 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004896 break;
4897 case 1:
4898 errmsg = "invalid start byte";
4899 startinpos = s - starts;
4900 endinpos = startinpos + 1;
4901 break;
4902 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004903 case 3:
4904 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 errmsg = "invalid continuation byte";
4906 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004907 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004908 break;
4909 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004910 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 goto onError;
4912 continue;
4913 }
4914
Victor Stinner1d65d912015-10-05 13:43:50 +02004915 if (error_handler == _Py_ERROR_UNKNOWN)
4916 error_handler = get_error_handler(errors);
4917
4918 switch (error_handler) {
4919 case _Py_ERROR_IGNORE:
4920 s += (endinpos - startinpos);
4921 break;
4922
4923 case _Py_ERROR_REPLACE:
4924 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4925 goto onError;
4926 s += (endinpos - startinpos);
4927 break;
4928
4929 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004930 {
4931 Py_ssize_t i;
4932
Victor Stinner1d65d912015-10-05 13:43:50 +02004933 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4934 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004935 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004936 ch = (Py_UCS4)(unsigned char)(starts[i]);
4937 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4938 ch + 0xdc00);
4939 writer.pos++;
4940 }
4941 s += (endinpos - startinpos);
4942 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004943 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004944
4945 default:
4946 if (unicode_decode_call_errorhandler_writer(
4947 errors, &error_handler_obj,
4948 "utf-8", errmsg,
4949 &starts, &end, &startinpos, &endinpos, &exc, &s,
4950 &writer))
4951 goto onError;
4952 }
Victor Stinner785938e2011-12-11 20:09:03 +01004953 }
4954
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 if (consumed)
4957 *consumed = s - starts;
4958
Victor Stinner1d65d912015-10-05 13:43:50 +02004959 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004961 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004962
4963onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004964 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004965 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004966 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004967 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004968}
4969
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004970#ifdef __APPLE__
4971
4972/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004973 used to decode the command line arguments on Mac OS X.
4974
4975 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004976 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004977
4978wchar_t*
4979_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4980{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004981 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 wchar_t *unicode;
4983 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004984
4985 /* Note: size will always be longer than the resulting Unicode
4986 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01004987 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004988 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004989 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004990 if (!unicode)
4991 return NULL;
4992
4993 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004994 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004996 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004998#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005000#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005002#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 if (ch > 0xFF) {
5004#if SIZEOF_WCHAR_T == 4
5005 assert(0);
5006#else
5007 assert(Py_UNICODE_IS_SURROGATE(ch));
5008 /* compute and append the two surrogates: */
5009 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5010 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5011#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005012 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005013 else {
5014 if (!ch && s == e)
5015 break;
5016 /* surrogateescape */
5017 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5018 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005019 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005020 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005021 return unicode;
5022}
5023
5024#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005026/* Primary internal function which creates utf8 encoded bytes objects.
5027
5028 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005029 and allocate exactly as much space needed at the end. Else allocate the
5030 maximum possible needed (4 result bytes per Unicode character), and return
5031 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005032*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005033PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005034_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035{
Victor Stinner6099a032011-12-18 14:22:26 +01005036 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005037 void *data;
5038 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005040 if (!PyUnicode_Check(unicode)) {
5041 PyErr_BadArgument();
5042 return NULL;
5043 }
5044
5045 if (PyUnicode_READY(unicode) == -1)
5046 return NULL;
5047
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005048 if (PyUnicode_UTF8(unicode))
5049 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5050 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005051
5052 kind = PyUnicode_KIND(unicode);
5053 data = PyUnicode_DATA(unicode);
5054 size = PyUnicode_GET_LENGTH(unicode);
5055
Benjamin Petersonead6b532011-12-20 17:23:42 -06005056 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005057 default:
5058 assert(0);
5059 case PyUnicode_1BYTE_KIND:
5060 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5061 assert(!PyUnicode_IS_ASCII(unicode));
5062 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5063 case PyUnicode_2BYTE_KIND:
5064 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5065 case PyUnicode_4BYTE_KIND:
5066 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068}
5069
Alexander Belopolsky40018472011-02-26 01:02:56 +00005070PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005071PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5072 Py_ssize_t size,
5073 const char *errors)
5074{
5075 PyObject *v, *unicode;
5076
5077 unicode = PyUnicode_FromUnicode(s, size);
5078 if (unicode == NULL)
5079 return NULL;
5080 v = _PyUnicode_AsUTF8String(unicode, errors);
5081 Py_DECREF(unicode);
5082 return v;
5083}
5084
5085PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005086PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005088 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089}
5090
Walter Dörwald41980ca2007-08-16 21:55:45 +00005091/* --- UTF-32 Codec ------------------------------------------------------- */
5092
5093PyObject *
5094PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 Py_ssize_t size,
5096 const char *errors,
5097 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098{
5099 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5100}
5101
5102PyObject *
5103PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 Py_ssize_t size,
5105 const char *errors,
5106 int *byteorder,
5107 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005108{
5109 const char *starts = s;
5110 Py_ssize_t startinpos;
5111 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005112 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005113 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005114 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005115 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005116 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005117 PyObject *errorHandler = NULL;
5118 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005119
Walter Dörwald41980ca2007-08-16 21:55:45 +00005120 q = (unsigned char *)s;
5121 e = q + size;
5122
5123 if (byteorder)
5124 bo = *byteorder;
5125
5126 /* Check for BOM marks (U+FEFF) in the input and adjust current
5127 byte order setting accordingly. In native mode, the leading BOM
5128 mark is skipped, in all other modes, it is copied to the output
5129 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005130 if (bo == 0 && size >= 4) {
5131 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5132 if (bom == 0x0000FEFF) {
5133 bo = -1;
5134 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005135 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005136 else if (bom == 0xFFFE0000) {
5137 bo = 1;
5138 q += 4;
5139 }
5140 if (byteorder)
5141 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142 }
5143
Victor Stinnere64322e2012-10-30 23:12:47 +01005144 if (q == e) {
5145 if (consumed)
5146 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005147 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005148 }
5149
Victor Stinnere64322e2012-10-30 23:12:47 +01005150#ifdef WORDS_BIGENDIAN
5151 le = bo < 0;
5152#else
5153 le = bo <= 0;
5154#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005155 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005156
Victor Stinner8f674cc2013-04-17 23:02:17 +02005157 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005158 writer.min_length = (e - q + 3) / 4;
5159 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005160 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005161
Victor Stinnere64322e2012-10-30 23:12:47 +01005162 while (1) {
5163 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005164 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005165
Victor Stinnere64322e2012-10-30 23:12:47 +01005166 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005167 enum PyUnicode_Kind kind = writer.kind;
5168 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005169 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005170 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005171 if (le) {
5172 do {
5173 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5174 if (ch > maxch)
5175 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005176 if (kind != PyUnicode_1BYTE_KIND &&
5177 Py_UNICODE_IS_SURROGATE(ch))
5178 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005179 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005180 q += 4;
5181 } while (q <= last);
5182 }
5183 else {
5184 do {
5185 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5186 if (ch > maxch)
5187 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005188 if (kind != PyUnicode_1BYTE_KIND &&
5189 Py_UNICODE_IS_SURROGATE(ch))
5190 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005191 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005192 q += 4;
5193 } while (q <= last);
5194 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005195 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005196 }
5197
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005198 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005199 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005200 startinpos = ((const char *)q) - starts;
5201 endinpos = startinpos + 4;
5202 }
5203 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005204 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005206 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005208 startinpos = ((const char *)q) - starts;
5209 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005210 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005211 else {
5212 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005213 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005214 goto onError;
5215 q += 4;
5216 continue;
5217 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005218 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005219 startinpos = ((const char *)q) - starts;
5220 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005222
5223 /* The remaining input chars are ignored if the callback
5224 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005225 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005226 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005227 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005229 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005231 }
5232
Walter Dörwald41980ca2007-08-16 21:55:45 +00005233 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005234 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005235
Walter Dörwald41980ca2007-08-16 21:55:45 +00005236 Py_XDECREF(errorHandler);
5237 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005238 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005239
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005241 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005242 Py_XDECREF(errorHandler);
5243 Py_XDECREF(exc);
5244 return NULL;
5245}
5246
5247PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005248_PyUnicode_EncodeUTF32(PyObject *str,
5249 const char *errors,
5250 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005251{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005252 enum PyUnicode_Kind kind;
5253 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005254 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005255 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005256 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005257#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005258 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005259#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005260 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005261#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005262 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005263 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005264 PyObject *errorHandler = NULL;
5265 PyObject *exc = NULL;
5266 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005267
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005268 if (!PyUnicode_Check(str)) {
5269 PyErr_BadArgument();
5270 return NULL;
5271 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005272 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005273 return NULL;
5274 kind = PyUnicode_KIND(str);
5275 data = PyUnicode_DATA(str);
5276 len = PyUnicode_GET_LENGTH(str);
5277
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005278 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005279 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005280 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005281 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005282 if (v == NULL)
5283 return NULL;
5284
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005285 /* output buffer is 4-bytes aligned */
5286 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5287 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005288 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005289 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005290 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005291 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005292
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005293 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005294 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005295 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005296 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005297 else
5298 encoding = "utf-32";
5299
5300 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005301 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5302 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005303 }
5304
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005305 pos = 0;
5306 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005307 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005308
5309 if (kind == PyUnicode_2BYTE_KIND) {
5310 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5311 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005312 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005313 else {
5314 assert(kind == PyUnicode_4BYTE_KIND);
5315 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5316 &out, native_ordering);
5317 }
5318 if (pos == len)
5319 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005320
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005321 rep = unicode_encode_call_errorhandler(
5322 errors, &errorHandler,
5323 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005324 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005325 if (!rep)
5326 goto error;
5327
5328 if (PyBytes_Check(rep)) {
5329 repsize = PyBytes_GET_SIZE(rep);
5330 if (repsize & 3) {
5331 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005332 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005333 "surrogates not allowed");
5334 goto error;
5335 }
5336 moreunits = repsize / 4;
5337 }
5338 else {
5339 assert(PyUnicode_Check(rep));
5340 if (PyUnicode_READY(rep) < 0)
5341 goto error;
5342 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5343 if (!PyUnicode_IS_ASCII(rep)) {
5344 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005345 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005346 "surrogates not allowed");
5347 goto error;
5348 }
5349 }
5350
5351 /* four bytes are reserved for each surrogate */
5352 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005353 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005354 Py_ssize_t morebytes = 4 * (moreunits - 1);
5355 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5356 /* integer overflow */
5357 PyErr_NoMemory();
5358 goto error;
5359 }
5360 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5361 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005362 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005363 }
5364
5365 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005366 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5367 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005368 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005369 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005370 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5371 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005372 }
5373
5374 Py_CLEAR(rep);
5375 }
5376
5377 /* Cut back to size actually needed. This is necessary for, for example,
5378 encoding of a string containing isolated surrogates and the 'ignore'
5379 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005380 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005381 if (nsize != PyBytes_GET_SIZE(v))
5382 _PyBytes_Resize(&v, nsize);
5383 Py_XDECREF(errorHandler);
5384 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005385 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005386 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005387 error:
5388 Py_XDECREF(rep);
5389 Py_XDECREF(errorHandler);
5390 Py_XDECREF(exc);
5391 Py_XDECREF(v);
5392 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005393}
5394
Alexander Belopolsky40018472011-02-26 01:02:56 +00005395PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005396PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5397 Py_ssize_t size,
5398 const char *errors,
5399 int byteorder)
5400{
5401 PyObject *result;
5402 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5403 if (tmp == NULL)
5404 return NULL;
5405 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5406 Py_DECREF(tmp);
5407 return result;
5408}
5409
5410PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005411PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005412{
Victor Stinnerb960b342011-11-20 19:12:52 +01005413 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005414}
5415
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416/* --- UTF-16 Codec ------------------------------------------------------- */
5417
Tim Peters772747b2001-08-09 22:21:55 +00005418PyObject *
5419PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 Py_ssize_t size,
5421 const char *errors,
5422 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423{
Walter Dörwald69652032004-09-07 20:24:22 +00005424 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5425}
5426
5427PyObject *
5428PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 Py_ssize_t size,
5430 const char *errors,
5431 int *byteorder,
5432 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005433{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005435 Py_ssize_t startinpos;
5436 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005437 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005438 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005439 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005440 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005441 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005442 PyObject *errorHandler = NULL;
5443 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005444 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445
Tim Peters772747b2001-08-09 22:21:55 +00005446 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005447 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448
5449 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005450 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005452 /* Check for BOM marks (U+FEFF) in the input and adjust current
5453 byte order setting accordingly. In native mode, the leading BOM
5454 mark is skipped, in all other modes, it is copied to the output
5455 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005456 if (bo == 0 && size >= 2) {
5457 const Py_UCS4 bom = (q[1] << 8) | q[0];
5458 if (bom == 0xFEFF) {
5459 q += 2;
5460 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005461 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005462 else if (bom == 0xFFFE) {
5463 q += 2;
5464 bo = 1;
5465 }
5466 if (byteorder)
5467 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469
Antoine Pitrou63065d72012-05-15 23:48:04 +02005470 if (q == e) {
5471 if (consumed)
5472 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005473 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005474 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005475
Christian Heimes743e0cd2012-10-17 23:52:17 +02005476#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005477 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005478 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005479#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005480 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005481 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005482#endif
Tim Peters772747b2001-08-09 22:21:55 +00005483
Antoine Pitrou63065d72012-05-15 23:48:04 +02005484 /* Note: size will always be longer than the resulting Unicode
5485 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005486 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005487 writer.min_length = (e - q + 1) / 2;
5488 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005489 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005490
Antoine Pitrou63065d72012-05-15 23:48:04 +02005491 while (1) {
5492 Py_UCS4 ch = 0;
5493 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005494 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005495 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005496 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005497 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005498 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005499 native_ordering);
5500 else
5501 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005502 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005503 native_ordering);
5504 } else if (kind == PyUnicode_2BYTE_KIND) {
5505 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005506 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005507 native_ordering);
5508 } else {
5509 assert(kind == PyUnicode_4BYTE_KIND);
5510 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005511 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005512 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005513 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005514 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515
Antoine Pitrou63065d72012-05-15 23:48:04 +02005516 switch (ch)
5517 {
5518 case 0:
5519 /* remaining byte at the end? (size should be even) */
5520 if (q == e || consumed)
5521 goto End;
5522 errmsg = "truncated data";
5523 startinpos = ((const char *)q) - starts;
5524 endinpos = ((const char *)e) - starts;
5525 break;
5526 /* The remaining input chars are ignored if the callback
5527 chooses to skip the input */
5528 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005529 q -= 2;
5530 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005531 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005532 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005533 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005534 endinpos = ((const char *)e) - starts;
5535 break;
5536 case 2:
5537 errmsg = "illegal encoding";
5538 startinpos = ((const char *)q) - 2 - starts;
5539 endinpos = startinpos + 2;
5540 break;
5541 case 3:
5542 errmsg = "illegal UTF-16 surrogate";
5543 startinpos = ((const char *)q) - 4 - starts;
5544 endinpos = startinpos + 2;
5545 break;
5546 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005547 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005548 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005549 continue;
5550 }
5551
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005552 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005553 errors,
5554 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005555 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005556 &starts,
5557 (const char **)&e,
5558 &startinpos,
5559 &endinpos,
5560 &exc,
5561 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005562 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 }
5565
Antoine Pitrou63065d72012-05-15 23:48:04 +02005566End:
Walter Dörwald69652032004-09-07 20:24:22 +00005567 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 Py_XDECREF(errorHandler);
5571 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005572 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005575 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576 Py_XDECREF(errorHandler);
5577 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 return NULL;
5579}
5580
Tim Peters772747b2001-08-09 22:21:55 +00005581PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005582_PyUnicode_EncodeUTF16(PyObject *str,
5583 const char *errors,
5584 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005586 enum PyUnicode_Kind kind;
5587 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005588 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005589 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005590 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005591 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005592#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005593 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005594#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005595 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005596#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005597 const char *encoding;
5598 Py_ssize_t nsize, pos;
5599 PyObject *errorHandler = NULL;
5600 PyObject *exc = NULL;
5601 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005602
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005603 if (!PyUnicode_Check(str)) {
5604 PyErr_BadArgument();
5605 return NULL;
5606 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005607 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005608 return NULL;
5609 kind = PyUnicode_KIND(str);
5610 data = PyUnicode_DATA(str);
5611 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005612
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005613 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005614 if (kind == PyUnicode_4BYTE_KIND) {
5615 const Py_UCS4 *in = (const Py_UCS4 *)data;
5616 const Py_UCS4 *end = in + len;
5617 while (in < end)
5618 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005619 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005620 }
5621 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005623 nsize = len + pairs + (byteorder == 0);
5624 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 if (v == NULL)
5626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005628 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005629 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005630 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005632 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005633 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005634 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005635
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005636 if (kind == PyUnicode_1BYTE_KIND) {
5637 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5638 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005639 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005640
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005641 if (byteorder < 0)
5642 encoding = "utf-16-le";
5643 else if (byteorder > 0)
5644 encoding = "utf-16-be";
5645 else
5646 encoding = "utf-16";
5647
5648 pos = 0;
5649 while (pos < len) {
5650 Py_ssize_t repsize, moreunits;
5651
5652 if (kind == PyUnicode_2BYTE_KIND) {
5653 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5654 &out, native_ordering);
5655 }
5656 else {
5657 assert(kind == PyUnicode_4BYTE_KIND);
5658 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5659 &out, native_ordering);
5660 }
5661 if (pos == len)
5662 break;
5663
5664 rep = unicode_encode_call_errorhandler(
5665 errors, &errorHandler,
5666 encoding, "surrogates not allowed",
5667 str, &exc, pos, pos + 1, &pos);
5668 if (!rep)
5669 goto error;
5670
5671 if (PyBytes_Check(rep)) {
5672 repsize = PyBytes_GET_SIZE(rep);
5673 if (repsize & 1) {
5674 raise_encode_exception(&exc, encoding,
5675 str, pos - 1, pos,
5676 "surrogates not allowed");
5677 goto error;
5678 }
5679 moreunits = repsize / 2;
5680 }
5681 else {
5682 assert(PyUnicode_Check(rep));
5683 if (PyUnicode_READY(rep) < 0)
5684 goto error;
5685 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5686 if (!PyUnicode_IS_ASCII(rep)) {
5687 raise_encode_exception(&exc, encoding,
5688 str, pos - 1, pos,
5689 "surrogates not allowed");
5690 goto error;
5691 }
5692 }
5693
5694 /* two bytes are reserved for each surrogate */
5695 if (moreunits > 1) {
5696 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5697 Py_ssize_t morebytes = 2 * (moreunits - 1);
5698 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5699 /* integer overflow */
5700 PyErr_NoMemory();
5701 goto error;
5702 }
5703 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5704 goto error;
5705 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5706 }
5707
5708 if (PyBytes_Check(rep)) {
5709 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5710 out += moreunits;
5711 } else /* rep is unicode */ {
5712 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5713 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5714 &out, native_ordering);
5715 }
5716
5717 Py_CLEAR(rep);
5718 }
5719
5720 /* Cut back to size actually needed. This is necessary for, for example,
5721 encoding of a string containing isolated surrogates and the 'ignore' handler
5722 is used. */
5723 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5724 if (nsize != PyBytes_GET_SIZE(v))
5725 _PyBytes_Resize(&v, nsize);
5726 Py_XDECREF(errorHandler);
5727 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005728 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005729 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005730 error:
5731 Py_XDECREF(rep);
5732 Py_XDECREF(errorHandler);
5733 Py_XDECREF(exc);
5734 Py_XDECREF(v);
5735 return NULL;
5736#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737}
5738
Alexander Belopolsky40018472011-02-26 01:02:56 +00005739PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005740PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5741 Py_ssize_t size,
5742 const char *errors,
5743 int byteorder)
5744{
5745 PyObject *result;
5746 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5747 if (tmp == NULL)
5748 return NULL;
5749 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5750 Py_DECREF(tmp);
5751 return result;
5752}
5753
5754PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005755PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005757 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758}
5759
5760/* --- Unicode Escape Codec ----------------------------------------------- */
5761
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   s    -- input bytes (need not be NUL-terminated)
   size -- number of bytes at s

   Returns -1 if size is negative, if the input contains a non-ASCII byte,
   if a backslash is the last byte, or if any escape was found which may
   cause the string to pop out of ASCII range (octal, \x, \u, \U, \N).
   Otherwise returns the length of the required buffer to hold the
   decoded string.
*/
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before doing any pointer arithmetic with it. */
    if (size < 0)
        return -1;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* simple escapes decode to exactly one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5816
Fredrik Lundh06d12682001-01-24 07:59:11 +00005817static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005818
Alexander Belopolsky40018472011-02-26 01:02:56 +00005819PyObject *
5820PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005821 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005822 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005825 Py_ssize_t startinpos;
5826 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005827 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005829 char* message;
5830 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005831 PyObject *errorHandler = NULL;
5832 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005833 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005834
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005835 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005836 if (len == 0)
5837 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005838
5839 /* After length_of_escaped_ascii_string() there are two alternatives,
5840 either the string is pure ASCII with named escapes like \n, etc.
5841 and we determined it's exact size (common case)
5842 or it contains \x, \u, ... escape sequences. then we create a
5843 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005844 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005845 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005846 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005847 }
5848 else {
5849 /* Escaped strings will always be longer than the resulting
5850 Unicode string, so we start with size here and then reduce the
5851 length after conversion to the true value.
5852 (but if the error callback returns a long replacement string
5853 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005854 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005855 }
5856
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005858 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005860
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 while (s < end) {
5862 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005863 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005864 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865
5866 /* Non-escape characters are interpreted as Unicode ordinals */
5867 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005868 x = (unsigned char)*s;
5869 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005870 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005871 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 continue;
5873 }
5874
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005875 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 /* \ - Escapes */
5877 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005878 c = *s++;
5879 if (s > end)
5880 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005881
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005882 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005885#define WRITECHAR(ch) \
5886 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005887 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005888 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005889 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005890
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005892 case '\\': WRITECHAR('\\'); break;
5893 case '\'': WRITECHAR('\''); break;
5894 case '\"': WRITECHAR('\"'); break;
5895 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005896 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005897 case 'f': WRITECHAR('\014'); break;
5898 case 't': WRITECHAR('\t'); break;
5899 case 'n': WRITECHAR('\n'); break;
5900 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005901 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005902 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005903 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005904 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 case '0': case '1': case '2': case '3':
5908 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005909 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005910 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005911 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005912 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005913 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005915 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 break;
5917
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 /* hex escapes */
5919 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005921 digits = 2;
5922 message = "truncated \\xXX escape";
5923 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005927 digits = 4;
5928 message = "truncated \\uXXXX escape";
5929 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005932 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005933 digits = 8;
5934 message = "truncated \\UXXXXXXXX escape";
5935 hexescape:
5936 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005937 if (end - s < digits) {
5938 /* count only hex digits */
5939 for (; s < end; ++s) {
5940 c = (unsigned char)*s;
5941 if (!Py_ISXDIGIT(c))
5942 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005943 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005944 goto error;
5945 }
5946 for (; digits--; ++s) {
5947 c = (unsigned char)*s;
5948 if (!Py_ISXDIGIT(c))
5949 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005950 chr = (chr<<4) & ~0xF;
5951 if (c >= '0' && c <= '9')
5952 chr += c - '0';
5953 else if (c >= 'a' && c <= 'f')
5954 chr += 10 + c - 'a';
5955 else
5956 chr += 10 + c - 'A';
5957 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005958 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005959 /* _decoding_error will have already written into the
5960 target buffer. */
5961 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005962 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005963 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005964 message = "illegal Unicode character";
5965 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005966 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005967 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005968 break;
5969
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005971 case 'N':
5972 message = "malformed \\N character escape";
5973 if (ucnhash_CAPI == NULL) {
5974 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005975 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5976 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005977 if (ucnhash_CAPI == NULL)
5978 goto ucnhashError;
5979 }
5980 if (*s == '{') {
5981 const char *start = s+1;
5982 /* look for the closing brace */
5983 while (*s != '}' && s < end)
5984 s++;
5985 if (s > start && s < end && *s == '}') {
5986 /* found a name. look it up in the unicode database */
5987 message = "unknown Unicode character name";
5988 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005989 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005990 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005991 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005992 goto store;
5993 }
5994 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005995 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005996
5997 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005998 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005999 message = "\\ at end of string";
6000 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006001 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00006002 }
6003 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006004 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02006005 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006006 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006007 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006009 continue;
6010
6011 error:
6012 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006013 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006014 errors, &errorHandler,
6015 "unicodeescape", message,
6016 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006017 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02006018 goto onError;
6019 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006021#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006022
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006023 Py_XDECREF(errorHandler);
6024 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006025 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006026
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006028 PyErr_SetString(
6029 PyExc_UnicodeError,
6030 "\\N escapes not supported (can't load unicodedata module)"
6031 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006032 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 Py_XDECREF(errorHandler);
6034 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006035 return NULL;
6036
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006038 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006039 Py_XDECREF(errorHandler);
6040 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 return NULL;
6042}
6043
6044/* Return a Unicode-Escape string version of the Unicode object.
6045
6046 If quotes is true, the string is enclosed in u"" or u'' quotes as
6047 appropriate.
6048
6049*/
6050
Alexander Belopolsky40018472011-02-26 01:02:56 +00006051PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006052PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006054 Py_ssize_t i, len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006056 int kind;
6057 void *data;
Victor Stinner358af132015-10-12 22:36:57 +02006058 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059
Ezio Melottie7f90372012-10-05 03:33:31 +03006060 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006061 escape.
6062
Ezio Melottie7f90372012-10-05 03:33:31 +03006063 For UCS1 strings it's '\xxx', 4 bytes per source character.
6064 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6065 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006066 */
6067
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006068 if (!PyUnicode_Check(unicode)) {
6069 PyErr_BadArgument();
6070 return NULL;
6071 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006072 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006073 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006074
6075 _PyBytesWriter_Init(&writer);
6076
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006077 len = PyUnicode_GET_LENGTH(unicode);
6078 kind = PyUnicode_KIND(unicode);
6079 data = PyUnicode_DATA(unicode);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006080
Victor Stinner358af132015-10-12 22:36:57 +02006081 p = _PyBytesWriter_Alloc(&writer, len);
6082 if (p == NULL)
6083 goto error;
6084 writer.overallocate = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006086 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006087 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006088
Walter Dörwald79e913e2007-05-12 11:08:06 +00006089 /* Escape backslashes */
6090 if (ch == '\\') {
Victor Stinner358af132015-10-12 22:36:57 +02006091 /* -1: substract 1 preallocated byte */
6092 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6093 if (p == NULL)
6094 goto error;
6095
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 *p++ = '\\';
6097 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006098 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006099 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006100
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006101 /* Map 21-bit characters to '\U00xxxxxx' */
6102 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006103 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006104
6105 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6106 if (p == NULL)
6107 goto error;
6108
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006109 *p++ = '\\';
6110 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006111 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6112 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6113 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6114 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6115 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6116 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6117 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6118 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006120 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006121
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006123 if (ch >= 256) {
Victor Stinner358af132015-10-12 22:36:57 +02006124 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6125 if (p == NULL)
6126 goto error;
6127
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 *p++ = '\\';
6129 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006130 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6131 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6132 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6133 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006135
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006136 /* Map special whitespace to '\t', \n', '\r' */
6137 else if (ch == '\t') {
Victor Stinner358af132015-10-12 22:36:57 +02006138 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6139 if (p == NULL)
6140 goto error;
6141
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006142 *p++ = '\\';
6143 *p++ = 't';
6144 }
6145 else if (ch == '\n') {
Victor Stinner358af132015-10-12 22:36:57 +02006146 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6147 if (p == NULL)
6148 goto error;
6149
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006150 *p++ = '\\';
6151 *p++ = 'n';
6152 }
6153 else if (ch == '\r') {
Victor Stinner358af132015-10-12 22:36:57 +02006154 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6155 if (p == NULL)
6156 goto error;
6157
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006158 *p++ = '\\';
6159 *p++ = 'r';
6160 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006161
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006162 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006163 else if (ch < ' ' || ch >= 0x7F) {
Victor Stinner358af132015-10-12 22:36:57 +02006164 /* -1: substract 1 preallocated byte */
6165 p = _PyBytesWriter_Prepare(&writer, p, 4-1);
6166 if (p == NULL)
6167 goto error;
6168
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006170 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006171 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6172 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006173 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006174
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 /* Copy everything else as-is */
6176 else
6177 *p++ = (char) ch;
6178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179
Victor Stinner358af132015-10-12 22:36:57 +02006180 return _PyBytesWriter_Finish(&writer, p);
6181
6182error:
6183 _PyBytesWriter_Dealloc(&writer);
6184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185}
6186
Alexander Belopolsky40018472011-02-26 01:02:56 +00006187PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6189 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006191 PyObject *result;
6192 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6193 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195 result = PyUnicode_AsUnicodeEscapeString(tmp);
6196 Py_DECREF(tmp);
6197 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198}
6199
6200/* --- Raw Unicode Escape Codec ------------------------------------------- */
6201
Alexander Belopolsky40018472011-02-26 01:02:56 +00006202PyObject *
6203PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006204 Py_ssize_t size,
6205 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006207 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006208 Py_ssize_t startinpos;
6209 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006210 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 const char *end;
6212 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006213 PyObject *errorHandler = NULL;
6214 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006215
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006216 if (size == 0)
6217 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006218
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 /* Escaped strings will always be longer than the resulting
6220 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006221 length after conversion to the true value. (But decoding error
6222 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006223 _PyUnicodeWriter_Init(&writer);
6224 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006225
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 end = s + size;
6227 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 unsigned char c;
6229 Py_UCS4 x;
6230 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006231 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 /* Non-escape characters are interpreted as Unicode ordinals */
6234 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006235 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006236 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006237 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006239 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 startinpos = s-starts;
6241
6242 /* \u-escapes are only interpreted iff the number of leading
6243 backslashes if odd */
6244 bs = s;
6245 for (;s < end;) {
6246 if (*s != '\\')
6247 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006248 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006249 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006250 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 }
6252 if (((s - bs) & 1) == 0 ||
6253 s >= end ||
6254 (*s != 'u' && *s != 'U')) {
6255 continue;
6256 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006257 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 count = *s=='u' ? 4 : 8;
6259 s++;
6260
6261 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 for (x = 0, i = 0; i < count; ++i, ++s) {
6263 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006264 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006265 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006266 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 errors, &errorHandler,
6268 "rawunicodeescape", "truncated \\uXXXX",
6269 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006270 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 goto onError;
6272 goto nextByte;
6273 }
6274 x = (x<<4) & ~0xF;
6275 if (c >= '0' && c <= '9')
6276 x += c - '0';
6277 else if (c >= 'a' && c <= 'f')
6278 x += 10 + c - 'a';
6279 else
6280 x += 10 + c - 'A';
6281 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006282 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006283 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006284 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006285 }
6286 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006287 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006288 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006289 errors, &errorHandler,
6290 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006292 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006294 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 nextByte:
6296 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298 Py_XDECREF(errorHandler);
6299 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006300 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006301
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006303 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006304 Py_XDECREF(errorHandler);
6305 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 return NULL;
6307}
6308
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006309
Alexander Belopolsky40018472011-02-26 01:02:56 +00006310PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006311PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 char *p;
Victor Stinner358af132015-10-12 22:36:57 +02006314 Py_ssize_t pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006315 int kind;
6316 void *data;
6317 Py_ssize_t len;
Victor Stinner358af132015-10-12 22:36:57 +02006318 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006320 if (!PyUnicode_Check(unicode)) {
6321 PyErr_BadArgument();
6322 return NULL;
6323 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006324 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006325 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006326
6327 _PyBytesWriter_Init(&writer);
6328
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006329 kind = PyUnicode_KIND(unicode);
6330 data = PyUnicode_DATA(unicode);
6331 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner0e368262011-11-10 20:12:49 +01006332
Victor Stinner358af132015-10-12 22:36:57 +02006333 p = _PyBytesWriter_Alloc(&writer, len);
6334 if (p == NULL)
6335 goto error;
6336 writer.overallocate = 1;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006337
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006338 for (pos = 0; pos < len; pos++) {
6339 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 /* Map 32-bit characters to '\Uxxxxxxxx' */
6341 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006342 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006343
6344 /* -1: substract 1 preallocated byte */
6345 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6346 if (p == NULL)
6347 goto error;
6348
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006349 *p++ = '\\';
6350 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006351 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6352 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6353 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6354 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6355 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6356 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6357 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6358 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006359 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006361 else if (ch >= 256) {
Victor Stinner358af132015-10-12 22:36:57 +02006362 /* -1: substract 1 preallocated byte */
6363 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6364 if (p == NULL)
6365 goto error;
6366
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 *p++ = '\\';
6368 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006369 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6370 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6371 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6372 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 /* Copy everything else as-is */
6375 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376 *p++ = (char) ch;
6377 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006378
Victor Stinner358af132015-10-12 22:36:57 +02006379 return _PyBytesWriter_Finish(&writer, p);
6380
6381error:
6382 _PyBytesWriter_Dealloc(&writer);
6383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384}
6385
Alexander Belopolsky40018472011-02-26 01:02:56 +00006386PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006387PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6388 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006390 PyObject *result;
6391 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6392 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006393 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006394 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6395 Py_DECREF(tmp);
6396 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397}
6398
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006399/* --- Unicode Internal Codec ------------------------------------------- */
6400
Alexander Belopolsky40018472011-02-26 01:02:56 +00006401PyObject *
6402_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006403 Py_ssize_t size,
6404 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006405{
6406 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006407 Py_ssize_t startinpos;
6408 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006409 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006410 const char *end;
6411 const char *reason;
6412 PyObject *errorHandler = NULL;
6413 PyObject *exc = NULL;
6414
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006415 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006416 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006417 1))
6418 return NULL;
6419
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006420 if (size == 0)
6421 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006422
Victor Stinner8f674cc2013-04-17 23:02:17 +02006423 _PyUnicodeWriter_Init(&writer);
6424 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6425 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006427 }
6428 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006429
Victor Stinner8f674cc2013-04-17 23:02:17 +02006430 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006431 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006432 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006433 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006434 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006435 endinpos = end-starts;
6436 reason = "truncated input";
6437 goto error;
6438 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006439 /* We copy the raw representation one byte at a time because the
6440 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006441 ((char *) &uch)[0] = s[0];
6442 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006443#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006444 ((char *) &uch)[2] = s[2];
6445 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006446#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006447 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006448#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006449 /* We have to sanity check the raw data, otherwise doom looms for
6450 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006451 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006452 endinpos = s - starts + Py_UNICODE_SIZE;
6453 reason = "illegal code point (> 0x10FFFF)";
6454 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006455 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006456#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006457 s += Py_UNICODE_SIZE;
6458#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006459 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006460 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006461 Py_UNICODE uch2;
6462 ((char *) &uch2)[0] = s[0];
6463 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006464 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006465 {
Victor Stinner551ac952011-11-29 22:58:13 +01006466 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006467 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006468 }
6469 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006470#endif
6471
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006472 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006473 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006474 continue;
6475
6476 error:
6477 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006478 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006479 errors, &errorHandler,
6480 "unicode_internal", reason,
6481 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006482 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006483 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006484 }
6485
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006486 Py_XDECREF(errorHandler);
6487 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006488 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006489
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006491 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006492 Py_XDECREF(errorHandler);
6493 Py_XDECREF(exc);
6494 return NULL;
6495}
6496
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497/* --- Latin-1 Codec ------------------------------------------------------ */
6498
Alexander Belopolsky40018472011-02-26 01:02:56 +00006499PyObject *
6500PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006501 Py_ssize_t size,
6502 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006505 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506}
6507
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006508/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006509static void
6510make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006511 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006512 PyObject *unicode,
6513 Py_ssize_t startpos, Py_ssize_t endpos,
6514 const char *reason)
6515{
6516 if (*exceptionObject == NULL) {
6517 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006519 encoding, unicode, startpos, endpos, reason);
6520 }
6521 else {
6522 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6523 goto onError;
6524 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6525 goto onError;
6526 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6527 goto onError;
6528 return;
6529 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006530 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006531 }
6532}
6533
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006534/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006535static void
6536raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006537 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006538 PyObject *unicode,
6539 Py_ssize_t startpos, Py_ssize_t endpos,
6540 const char *reason)
6541{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006542 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006543 encoding, unicode, startpos, endpos, reason);
6544 if (*exceptionObject != NULL)
6545 PyCodec_StrictErrors(*exceptionObject);
6546}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006547
6548/* error handling callback helper:
6549 build arguments, call the callback and check the arguments,
6550 put the result into newpos and return the replacement string, which
6551 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006552static PyObject *
6553unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006554 PyObject **errorHandler,
6555 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006556 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006557 Py_ssize_t startpos, Py_ssize_t endpos,
6558 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006559{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006560 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006561 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562 PyObject *restuple;
6563 PyObject *resunicode;
6564
6565 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006567 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006568 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006569 }
6570
Benjamin Petersonbac79492012-01-14 13:34:47 -05006571 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006572 return NULL;
6573 len = PyUnicode_GET_LENGTH(unicode);
6574
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006575 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006576 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006577 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006579
6580 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006582 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006584 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006585 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 Py_DECREF(restuple);
6587 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006588 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006589 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 &resunicode, newpos)) {
6591 Py_DECREF(restuple);
6592 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006593 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006594 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6595 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6596 Py_DECREF(restuple);
6597 return NULL;
6598 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006599 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006600 *newpos = len + *newpos;
6601 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006602 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 Py_DECREF(restuple);
6604 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006605 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006606 Py_INCREF(resunicode);
6607 Py_DECREF(restuple);
6608 return resunicode;
6609}
6610
Alexander Belopolsky40018472011-02-26 01:02:56 +00006611static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006613 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006614 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006615{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006616 /* input state */
6617 Py_ssize_t pos=0, size;
6618 int kind;
6619 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620 /* pointer into the output */
6621 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006622 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6623 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006624 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006625 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006626 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006627 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006628 /* output object */
6629 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006630
Benjamin Petersonbac79492012-01-14 13:34:47 -05006631 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006632 return NULL;
6633 size = PyUnicode_GET_LENGTH(unicode);
6634 kind = PyUnicode_KIND(unicode);
6635 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636 /* allocate enough for a simple encoding without
6637 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006638 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006639 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006640
6641 _PyBytesWriter_Init(&writer);
6642 str = _PyBytesWriter_Alloc(&writer, size);
6643 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006644 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006645
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006646 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006647 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006648
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006650 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006652 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006653 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006654 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006656 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006658 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006659 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006661
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006662 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006663 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006664
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006665 /* Only overallocate the buffer if it's not the last write */
6666 writer.overallocate = (collend < size);
6667
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006669 if (error_handler == _Py_ERROR_UNKNOWN)
6670 error_handler = get_error_handler(errors);
6671
6672 switch (error_handler) {
6673 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006674 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006676
6677 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006678 memset(str, '?', collend - collstart);
6679 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006680 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006681 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006682 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 break;
Victor Stinner50149202015-09-22 00:26:54 +02006684
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006685 case _Py_ERROR_BACKSLASHREPLACE:
Victor Stinnerad771582015-10-09 12:38:53 +02006686 /* substract preallocated bytes */
6687 writer.min_size -= (collend - collstart);
6688 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006689 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006690 if (str == NULL)
6691 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006692 pos = collend;
6693 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006694
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006695 case _Py_ERROR_XMLCHARREFREPLACE:
Victor Stinnerad771582015-10-09 12:38:53 +02006696 /* substract preallocated bytes */
6697 writer.min_size -= (collend - collstart);
6698 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006699 unicode, collstart, collend);
6700 if (str == NULL)
6701 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 break;
Victor Stinner50149202015-09-22 00:26:54 +02006704
Victor Stinnerc3713e92015-09-29 12:32:13 +02006705 case _Py_ERROR_SURROGATEESCAPE:
6706 for (i = collstart; i < collend; ++i) {
6707 ch = PyUnicode_READ(kind, data, i);
6708 if (ch < 0xdc80 || 0xdcff < ch) {
6709 /* Not a UTF-8b surrogate */
6710 break;
6711 }
6712 *str++ = (char)(ch - 0xdc00);
6713 ++pos;
6714 }
6715 if (i >= collend)
6716 break;
6717 collstart = pos;
6718 assert(collstart != collend);
6719 /* fallback to general error handling */
6720
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006722 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6723 encoding, reason, unicode, &exc,
6724 collstart, collend, &newpos);
6725 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006727
Victor Stinnerad771582015-10-09 12:38:53 +02006728 /* substract preallocated bytes */
6729 writer.min_size -= 1;
6730
Victor Stinner6bd525b2015-10-09 13:10:05 +02006731 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006732 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006733 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006734 PyBytes_AS_STRING(rep),
6735 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006736 if (str == NULL)
6737 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006738 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006739 else {
6740 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006741
Victor Stinner6bd525b2015-10-09 13:10:05 +02006742 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006744
6745 if (PyUnicode_IS_ASCII(rep)) {
6746 /* Fast path: all characters are smaller than limit */
6747 assert(limit >= 128);
6748 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6749 str = _PyBytesWriter_WriteBytes(&writer, str,
6750 PyUnicode_DATA(rep),
6751 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006753 else {
6754 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6755
6756 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6757 if (str == NULL)
6758 goto onError;
6759
6760 /* check if there is anything unencodable in the
6761 replacement and copy it to the output */
6762 for (i = 0; repsize-->0; ++i, ++str) {
6763 ch = PyUnicode_READ_CHAR(rep, i);
6764 if (ch >= limit) {
6765 raise_encode_exception(&exc, encoding, unicode,
6766 pos, pos+1, reason);
6767 goto onError;
6768 }
6769 *str = (char)ch;
6770 }
6771 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006773 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006774 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006775 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006776
6777 /* If overallocation was disabled, ensure that it was the last
6778 write. Otherwise, we missed an optimization */
6779 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006780 }
6781 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006782
Victor Stinner50149202015-09-22 00:26:54 +02006783 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006784 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006785 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006786
6787 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006788 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006789 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006790 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006791 Py_XDECREF(exc);
6792 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006793}
6794
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006795/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006796PyObject *
6797PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006798 Py_ssize_t size,
6799 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006801 PyObject *result;
6802 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6803 if (unicode == NULL)
6804 return NULL;
6805 result = unicode_encode_ucs1(unicode, errors, 256);
6806 Py_DECREF(unicode);
6807 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808}
6809
Alexander Belopolsky40018472011-02-26 01:02:56 +00006810PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006811_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812{
6813 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 PyErr_BadArgument();
6815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006817 if (PyUnicode_READY(unicode) == -1)
6818 return NULL;
6819 /* Fast path: if it is a one-byte string, construct
6820 bytes object directly. */
6821 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6822 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6823 PyUnicode_GET_LENGTH(unicode));
6824 /* Non-Latin-1 characters present. Defer to above function to
6825 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006826 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006827}
6828
6829PyObject*
6830PyUnicode_AsLatin1String(PyObject *unicode)
6831{
6832 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833}
6834
6835/* --- 7-bit ASCII Codec -------------------------------------------------- */
6836
Alexander Belopolsky40018472011-02-26 01:02:56 +00006837PyObject *
6838PyUnicode_DecodeASCII(const char *s,
6839 Py_ssize_t size,
6840 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006842 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006843 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006844 int kind;
6845 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006846 Py_ssize_t startinpos;
6847 Py_ssize_t endinpos;
6848 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006849 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006850 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006851 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006852 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006853
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006855 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006856
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006858 if (size == 1 && (unsigned char)s[0] < 128)
6859 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006860
Victor Stinner8f674cc2013-04-17 23:02:17 +02006861 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006862 writer.min_length = size;
6863 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006864 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006866 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006867 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006868 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006869 writer.pos = outpos;
6870 if (writer.pos == size)
6871 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006872
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006873 s += writer.pos;
6874 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006875 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006876 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006878 PyUnicode_WRITE(kind, data, writer.pos, c);
6879 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006881 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006882 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006883
6884 /* byte outsize range 0x00..0x7f: call the error handler */
6885
6886 if (error_handler == _Py_ERROR_UNKNOWN)
6887 error_handler = get_error_handler(errors);
6888
6889 switch (error_handler)
6890 {
6891 case _Py_ERROR_REPLACE:
6892 case _Py_ERROR_SURROGATEESCAPE:
6893 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006894 but we may switch to UCS2 at the first write */
6895 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6896 goto onError;
6897 kind = writer.kind;
6898 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006899
6900 if (error_handler == _Py_ERROR_REPLACE)
6901 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6902 else
6903 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6904 writer.pos++;
6905 ++s;
6906 break;
6907
6908 case _Py_ERROR_IGNORE:
6909 ++s;
6910 break;
6911
6912 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006913 startinpos = s-starts;
6914 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006915 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006916 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 "ascii", "ordinal not in range(128)",
6918 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006919 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006921 kind = writer.kind;
6922 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006925 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006926 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006927 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006928
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006930 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006931 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006932 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 return NULL;
6934}
6935
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006936/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006937PyObject *
6938PyUnicode_EncodeASCII(const Py_UNICODE *p,
6939 Py_ssize_t size,
6940 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006942 PyObject *result;
6943 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6944 if (unicode == NULL)
6945 return NULL;
6946 result = unicode_encode_ucs1(unicode, errors, 128);
6947 Py_DECREF(unicode);
6948 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949}
6950
Alexander Belopolsky40018472011-02-26 01:02:56 +00006951PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006952_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953{
6954 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006955 PyErr_BadArgument();
6956 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006958 if (PyUnicode_READY(unicode) == -1)
6959 return NULL;
6960 /* Fast path: if it is an ASCII-only string, construct bytes object
6961 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006962 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006963 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6964 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006965 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006966}
6967
6968PyObject *
6969PyUnicode_AsASCIIString(PyObject *unicode)
6970{
6971 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972}
6973
Victor Stinner99b95382011-07-04 14:23:54 +02006974#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006975
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006976/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006977
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006978#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006979#define NEED_RETRY
6980#endif
6981
Victor Stinner3a50e702011-10-18 21:21:00 +02006982#ifndef WC_ERR_INVALID_CHARS
6983# define WC_ERR_INVALID_CHARS 0x0080
6984#endif
6985
6986static char*
6987code_page_name(UINT code_page, PyObject **obj)
6988{
6989 *obj = NULL;
6990 if (code_page == CP_ACP)
6991 return "mbcs";
6992 if (code_page == CP_UTF7)
6993 return "CP_UTF7";
6994 if (code_page == CP_UTF8)
6995 return "CP_UTF8";
6996
6997 *obj = PyBytes_FromFormat("cp%u", code_page);
6998 if (*obj == NULL)
6999 return NULL;
7000 return PyBytes_AS_STRING(*obj);
7001}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007002
Victor Stinner3a50e702011-10-18 21:21:00 +02007003static DWORD
7004decode_code_page_flags(UINT code_page)
7005{
7006 if (code_page == CP_UTF7) {
7007 /* The CP_UTF7 decoder only supports flags=0 */
7008 return 0;
7009 }
7010 else
7011 return MB_ERR_INVALID_CHARS;
7012}
7013
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007014/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 * Decode a byte string from a Windows code page into unicode object in strict
7016 * mode.
7017 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007018 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7019 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007021static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007022decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007023 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007024 const char *in,
7025 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007026{
Victor Stinner3a50e702011-10-18 21:21:00 +02007027 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007028 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007029 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007030
7031 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007032 assert(insize > 0);
7033 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7034 if (outsize <= 0)
7035 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007036
7037 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007039 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007040 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007041 if (*v == NULL)
7042 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007043 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007044 }
7045 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007046 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007047 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007048 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007050 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007051 }
7052
7053 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007054 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7055 if (outsize <= 0)
7056 goto error;
7057 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007058
Victor Stinner3a50e702011-10-18 21:21:00 +02007059error:
7060 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7061 return -2;
7062 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007063 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007064}
7065
Victor Stinner3a50e702011-10-18 21:21:00 +02007066/*
7067 * Decode a byte string from a code page into unicode object with an error
7068 * handler.
7069 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007070 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007071 * UnicodeDecodeError exception and returns -1 on error.
7072 */
7073static int
7074decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007075 PyObject **v,
7076 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007077 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007078{
7079 const char *startin = in;
7080 const char *endin = in + size;
7081 const DWORD flags = decode_code_page_flags(code_page);
7082 /* Ideally, we should get reason from FormatMessage. This is the Windows
7083 2000 English version of the message. */
7084 const char *reason = "No mapping for the Unicode character exists "
7085 "in the target code page.";
7086 /* each step cannot decode more than 1 character, but a character can be
7087 represented as a surrogate pair */
7088 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007089 int insize;
7090 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007091 PyObject *errorHandler = NULL;
7092 PyObject *exc = NULL;
7093 PyObject *encoding_obj = NULL;
7094 char *encoding;
7095 DWORD err;
7096 int ret = -1;
7097
7098 assert(size > 0);
7099
7100 encoding = code_page_name(code_page, &encoding_obj);
7101 if (encoding == NULL)
7102 return -1;
7103
Victor Stinner7d00cc12014-03-17 23:08:06 +01007104 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007105 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7106 UnicodeDecodeError. */
7107 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7108 if (exc != NULL) {
7109 PyCodec_StrictErrors(exc);
7110 Py_CLEAR(exc);
7111 }
7112 goto error;
7113 }
7114
7115 if (*v == NULL) {
7116 /* Create unicode object */
7117 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7118 PyErr_NoMemory();
7119 goto error;
7120 }
Victor Stinnerab595942011-12-17 04:59:06 +01007121 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007122 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007123 if (*v == NULL)
7124 goto error;
7125 startout = PyUnicode_AS_UNICODE(*v);
7126 }
7127 else {
7128 /* Extend unicode object */
7129 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7130 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7131 PyErr_NoMemory();
7132 goto error;
7133 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007134 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 goto error;
7136 startout = PyUnicode_AS_UNICODE(*v) + n;
7137 }
7138
7139 /* Decode the byte string character per character */
7140 out = startout;
7141 while (in < endin)
7142 {
7143 /* Decode a character */
7144 insize = 1;
7145 do
7146 {
7147 outsize = MultiByteToWideChar(code_page, flags,
7148 in, insize,
7149 buffer, Py_ARRAY_LENGTH(buffer));
7150 if (outsize > 0)
7151 break;
7152 err = GetLastError();
7153 if (err != ERROR_NO_UNICODE_TRANSLATION
7154 && err != ERROR_INSUFFICIENT_BUFFER)
7155 {
7156 PyErr_SetFromWindowsErr(0);
7157 goto error;
7158 }
7159 insize++;
7160 }
7161 /* 4=maximum length of a UTF-8 sequence */
7162 while (insize <= 4 && (in + insize) <= endin);
7163
7164 if (outsize <= 0) {
7165 Py_ssize_t startinpos, endinpos, outpos;
7166
Victor Stinner7d00cc12014-03-17 23:08:06 +01007167 /* last character in partial decode? */
7168 if (in + insize >= endin && !final)
7169 break;
7170
Victor Stinner3a50e702011-10-18 21:21:00 +02007171 startinpos = in - startin;
7172 endinpos = startinpos + 1;
7173 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007174 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007175 errors, &errorHandler,
7176 encoding, reason,
7177 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007178 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 {
7180 goto error;
7181 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007182 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 }
7184 else {
7185 in += insize;
7186 memcpy(out, buffer, outsize * sizeof(wchar_t));
7187 out += outsize;
7188 }
7189 }
7190
7191 /* write a NUL character at the end */
7192 *out = 0;
7193
7194 /* Extend unicode object */
7195 outsize = out - startout;
7196 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007197 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007199 /* (in - startin) <= size and size is an int */
7200 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007201
7202error:
7203 Py_XDECREF(encoding_obj);
7204 Py_XDECREF(errorHandler);
7205 Py_XDECREF(exc);
7206 return ret;
7207}
7208
Victor Stinner3a50e702011-10-18 21:21:00 +02007209static PyObject *
7210decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007211 const char *s, Py_ssize_t size,
7212 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007213{
Victor Stinner76a31a62011-11-04 00:05:13 +01007214 PyObject *v = NULL;
7215 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007216
Victor Stinner3a50e702011-10-18 21:21:00 +02007217 if (code_page < 0) {
7218 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7219 return NULL;
7220 }
7221
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007222 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007224
Victor Stinner76a31a62011-11-04 00:05:13 +01007225 do
7226 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007227#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007228 if (size > INT_MAX) {
7229 chunk_size = INT_MAX;
7230 final = 0;
7231 done = 0;
7232 }
7233 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007234#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007235 {
7236 chunk_size = (int)size;
7237 final = (consumed == NULL);
7238 done = 1;
7239 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007240
Victor Stinner76a31a62011-11-04 00:05:13 +01007241 if (chunk_size == 0 && done) {
7242 if (v != NULL)
7243 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007244 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007245 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007246
Victor Stinner76a31a62011-11-04 00:05:13 +01007247 converted = decode_code_page_strict(code_page, &v,
7248 s, chunk_size);
7249 if (converted == -2)
7250 converted = decode_code_page_errors(code_page, &v,
7251 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007252 errors, final);
7253 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007254
7255 if (converted < 0) {
7256 Py_XDECREF(v);
7257 return NULL;
7258 }
7259
7260 if (consumed)
7261 *consumed += converted;
7262
7263 s += converted;
7264 size -= converted;
7265 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007266
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007267 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007268}
7269
Alexander Belopolsky40018472011-02-26 01:02:56 +00007270PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007271PyUnicode_DecodeCodePageStateful(int code_page,
7272 const char *s,
7273 Py_ssize_t size,
7274 const char *errors,
7275 Py_ssize_t *consumed)
7276{
7277 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7278}
7279
7280PyObject *
7281PyUnicode_DecodeMBCSStateful(const char *s,
7282 Py_ssize_t size,
7283 const char *errors,
7284 Py_ssize_t *consumed)
7285{
7286 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7287}
7288
7289PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007290PyUnicode_DecodeMBCS(const char *s,
7291 Py_ssize_t size,
7292 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007293{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007294 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7295}
7296
Victor Stinner3a50e702011-10-18 21:21:00 +02007297static DWORD
7298encode_code_page_flags(UINT code_page, const char *errors)
7299{
7300 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007301 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 }
7303 else if (code_page == CP_UTF7) {
7304 /* CP_UTF7 only supports flags=0 */
7305 return 0;
7306 }
7307 else {
7308 if (errors != NULL && strcmp(errors, "replace") == 0)
7309 return 0;
7310 else
7311 return WC_NO_BEST_FIT_CHARS;
7312 }
7313}
7314
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007315/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 * Encode a Unicode string to a Windows code page into a byte string in strict
7317 * mode.
7318 *
7319 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007320 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007321 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007322static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007323encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007324 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007325 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326{
Victor Stinner554f3f02010-06-16 23:33:54 +00007327 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 BOOL *pusedDefaultChar = &usedDefaultChar;
7329 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007330 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007331 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007332 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 const DWORD flags = encode_code_page_flags(code_page, NULL);
7334 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007335 /* Create a substring so that we can get the UTF-16 representation
7336 of just the slice under consideration. */
7337 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007338
Martin v. Löwis3d325192011-11-04 18:23:06 +01007339 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007340
Victor Stinner3a50e702011-10-18 21:21:00 +02007341 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007342 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007344 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007345
Victor Stinner2fc507f2011-11-04 20:06:39 +01007346 substring = PyUnicode_Substring(unicode, offset, offset+len);
7347 if (substring == NULL)
7348 return -1;
7349 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7350 if (p == NULL) {
7351 Py_DECREF(substring);
7352 return -1;
7353 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007354 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007355
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007356 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007358 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007359 NULL, 0,
7360 NULL, pusedDefaultChar);
7361 if (outsize <= 0)
7362 goto error;
7363 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007364 if (pusedDefaultChar && *pusedDefaultChar) {
7365 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007366 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007367 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007368
Victor Stinner3a50e702011-10-18 21:21:00 +02007369 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007371 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007372 if (*outbytes == NULL) {
7373 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007375 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007377 }
7378 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 const Py_ssize_t n = PyBytes_Size(*outbytes);
7381 if (outsize > PY_SSIZE_T_MAX - n) {
7382 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007383 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007386 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7387 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007389 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007391 }
7392
7393 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007395 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 out, outsize,
7397 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007398 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 if (outsize <= 0)
7400 goto error;
7401 if (pusedDefaultChar && *pusedDefaultChar)
7402 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007403 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007404
Victor Stinner3a50e702011-10-18 21:21:00 +02007405error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007406 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007407 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7408 return -2;
7409 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007410 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007411}
7412
Victor Stinner3a50e702011-10-18 21:21:00 +02007413/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007414 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007415 * error handler.
7416 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007417 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 * -1 on other error.
7419 */
7420static int
7421encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007422 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007423 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007424{
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007426 Py_ssize_t pos = unicode_offset;
7427 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007428 /* Ideally, we should get reason from FormatMessage. This is the Windows
7429 2000 English version of the message. */
7430 const char *reason = "invalid character";
7431 /* 4=maximum length of a UTF-8 sequence */
7432 char buffer[4];
7433 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7434 Py_ssize_t outsize;
7435 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 PyObject *errorHandler = NULL;
7437 PyObject *exc = NULL;
7438 PyObject *encoding_obj = NULL;
7439 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007440 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007441 PyObject *rep;
7442 int ret = -1;
7443
7444 assert(insize > 0);
7445
7446 encoding = code_page_name(code_page, &encoding_obj);
7447 if (encoding == NULL)
7448 return -1;
7449
7450 if (errors == NULL || strcmp(errors, "strict") == 0) {
7451 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7452 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007453 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 if (exc != NULL) {
7455 PyCodec_StrictErrors(exc);
7456 Py_DECREF(exc);
7457 }
7458 Py_XDECREF(encoding_obj);
7459 return -1;
7460 }
7461
7462 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7463 pusedDefaultChar = &usedDefaultChar;
7464 else
7465 pusedDefaultChar = NULL;
7466
7467 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7468 PyErr_NoMemory();
7469 goto error;
7470 }
7471 outsize = insize * Py_ARRAY_LENGTH(buffer);
7472
7473 if (*outbytes == NULL) {
7474 /* Create string object */
7475 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7476 if (*outbytes == NULL)
7477 goto error;
7478 out = PyBytes_AS_STRING(*outbytes);
7479 }
7480 else {
7481 /* Extend string object */
7482 Py_ssize_t n = PyBytes_Size(*outbytes);
7483 if (n > PY_SSIZE_T_MAX - outsize) {
7484 PyErr_NoMemory();
7485 goto error;
7486 }
7487 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7488 goto error;
7489 out = PyBytes_AS_STRING(*outbytes) + n;
7490 }
7491
7492 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007493 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007495 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7496 wchar_t chars[2];
7497 int charsize;
7498 if (ch < 0x10000) {
7499 chars[0] = (wchar_t)ch;
7500 charsize = 1;
7501 }
7502 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007503 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7504 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007505 charsize = 2;
7506 }
7507
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007509 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 buffer, Py_ARRAY_LENGTH(buffer),
7511 NULL, pusedDefaultChar);
7512 if (outsize > 0) {
7513 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7514 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007515 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 memcpy(out, buffer, outsize);
7517 out += outsize;
7518 continue;
7519 }
7520 }
7521 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7522 PyErr_SetFromWindowsErr(0);
7523 goto error;
7524 }
7525
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 rep = unicode_encode_call_errorhandler(
7527 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007528 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007529 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 if (rep == NULL)
7531 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007532 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007533
7534 if (PyBytes_Check(rep)) {
7535 outsize = PyBytes_GET_SIZE(rep);
7536 if (outsize != 1) {
7537 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7538 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7539 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7540 Py_DECREF(rep);
7541 goto error;
7542 }
7543 out = PyBytes_AS_STRING(*outbytes) + offset;
7544 }
7545 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7546 out += outsize;
7547 }
7548 else {
7549 Py_ssize_t i;
7550 enum PyUnicode_Kind kind;
7551 void *data;
7552
Benjamin Petersonbac79492012-01-14 13:34:47 -05007553 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 Py_DECREF(rep);
7555 goto error;
7556 }
7557
7558 outsize = PyUnicode_GET_LENGTH(rep);
7559 if (outsize != 1) {
7560 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7561 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7562 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7563 Py_DECREF(rep);
7564 goto error;
7565 }
7566 out = PyBytes_AS_STRING(*outbytes) + offset;
7567 }
7568 kind = PyUnicode_KIND(rep);
7569 data = PyUnicode_DATA(rep);
7570 for (i=0; i < outsize; i++) {
7571 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7572 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007573 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007574 encoding, unicode,
7575 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007576 "unable to encode error handler result to ASCII");
7577 Py_DECREF(rep);
7578 goto error;
7579 }
7580 *out = (unsigned char)ch;
7581 out++;
7582 }
7583 }
7584 Py_DECREF(rep);
7585 }
7586 /* write a NUL byte */
7587 *out = 0;
7588 outsize = out - PyBytes_AS_STRING(*outbytes);
7589 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7590 if (_PyBytes_Resize(outbytes, outsize) < 0)
7591 goto error;
7592 ret = 0;
7593
7594error:
7595 Py_XDECREF(encoding_obj);
7596 Py_XDECREF(errorHandler);
7597 Py_XDECREF(exc);
7598 return ret;
7599}
7600
Victor Stinner3a50e702011-10-18 21:21:00 +02007601static PyObject *
7602encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007603 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 const char *errors)
7605{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007606 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007608 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007609 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007610
Victor Stinner29dacf22015-01-26 16:41:32 +01007611 if (!PyUnicode_Check(unicode)) {
7612 PyErr_BadArgument();
7613 return NULL;
7614 }
7615
Benjamin Petersonbac79492012-01-14 13:34:47 -05007616 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007617 return NULL;
7618 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007619
Victor Stinner3a50e702011-10-18 21:21:00 +02007620 if (code_page < 0) {
7621 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7622 return NULL;
7623 }
7624
Martin v. Löwis3d325192011-11-04 18:23:06 +01007625 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007626 return PyBytes_FromStringAndSize(NULL, 0);
7627
Victor Stinner7581cef2011-11-03 22:32:33 +01007628 offset = 0;
7629 do
7630 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007631#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007632 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007633 chunks. */
7634 if (len > INT_MAX/2) {
7635 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007636 done = 0;
7637 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007638 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007639#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007640 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007641 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007642 done = 1;
7643 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007644
Victor Stinner76a31a62011-11-04 00:05:13 +01007645 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007646 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007647 errors);
7648 if (ret == -2)
7649 ret = encode_code_page_errors(code_page, &outbytes,
7650 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007651 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007652 if (ret < 0) {
7653 Py_XDECREF(outbytes);
7654 return NULL;
7655 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007656
Victor Stinner7581cef2011-11-03 22:32:33 +01007657 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007658 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007659 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007660
Victor Stinner3a50e702011-10-18 21:21:00 +02007661 return outbytes;
7662}
7663
7664PyObject *
7665PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7666 Py_ssize_t size,
7667 const char *errors)
7668{
Victor Stinner7581cef2011-11-03 22:32:33 +01007669 PyObject *unicode, *res;
7670 unicode = PyUnicode_FromUnicode(p, size);
7671 if (unicode == NULL)
7672 return NULL;
7673 res = encode_code_page(CP_ACP, unicode, errors);
7674 Py_DECREF(unicode);
7675 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007676}
7677
7678PyObject *
7679PyUnicode_EncodeCodePage(int code_page,
7680 PyObject *unicode,
7681 const char *errors)
7682{
Victor Stinner7581cef2011-11-03 22:32:33 +01007683 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007684}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007685
Alexander Belopolsky40018472011-02-26 01:02:56 +00007686PyObject *
7687PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007688{
Victor Stinner7581cef2011-11-03 22:32:33 +01007689 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007690}
7691
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007692#undef NEED_RETRY
7693
Victor Stinner99b95382011-07-04 14:23:54 +02007694#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007695
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696/* --- Character Mapping Codec -------------------------------------------- */
7697
Victor Stinnerfb161b12013-04-18 01:44:27 +02007698static int
7699charmap_decode_string(const char *s,
7700 Py_ssize_t size,
7701 PyObject *mapping,
7702 const char *errors,
7703 _PyUnicodeWriter *writer)
7704{
7705 const char *starts = s;
7706 const char *e;
7707 Py_ssize_t startinpos, endinpos;
7708 PyObject *errorHandler = NULL, *exc = NULL;
7709 Py_ssize_t maplen;
7710 enum PyUnicode_Kind mapkind;
7711 void *mapdata;
7712 Py_UCS4 x;
7713 unsigned char ch;
7714
7715 if (PyUnicode_READY(mapping) == -1)
7716 return -1;
7717
7718 maplen = PyUnicode_GET_LENGTH(mapping);
7719 mapdata = PyUnicode_DATA(mapping);
7720 mapkind = PyUnicode_KIND(mapping);
7721
7722 e = s + size;
7723
7724 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7725 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7726 * is disabled in encoding aliases, latin1 is preferred because
7727 * its implementation is faster. */
7728 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7729 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7730 Py_UCS4 maxchar = writer->maxchar;
7731
7732 assert (writer->kind == PyUnicode_1BYTE_KIND);
7733 while (s < e) {
7734 ch = *s;
7735 x = mapdata_ucs1[ch];
7736 if (x > maxchar) {
7737 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7738 goto onError;
7739 maxchar = writer->maxchar;
7740 outdata = (Py_UCS1 *)writer->data;
7741 }
7742 outdata[writer->pos] = x;
7743 writer->pos++;
7744 ++s;
7745 }
7746 return 0;
7747 }
7748
7749 while (s < e) {
7750 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7751 enum PyUnicode_Kind outkind = writer->kind;
7752 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7753 if (outkind == PyUnicode_1BYTE_KIND) {
7754 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7755 Py_UCS4 maxchar = writer->maxchar;
7756 while (s < e) {
7757 ch = *s;
7758 x = mapdata_ucs2[ch];
7759 if (x > maxchar)
7760 goto Error;
7761 outdata[writer->pos] = x;
7762 writer->pos++;
7763 ++s;
7764 }
7765 break;
7766 }
7767 else if (outkind == PyUnicode_2BYTE_KIND) {
7768 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7769 while (s < e) {
7770 ch = *s;
7771 x = mapdata_ucs2[ch];
7772 if (x == 0xFFFE)
7773 goto Error;
7774 outdata[writer->pos] = x;
7775 writer->pos++;
7776 ++s;
7777 }
7778 break;
7779 }
7780 }
7781 ch = *s;
7782
7783 if (ch < maplen)
7784 x = PyUnicode_READ(mapkind, mapdata, ch);
7785 else
7786 x = 0xfffe; /* invalid value */
7787Error:
7788 if (x == 0xfffe)
7789 {
7790 /* undefined mapping */
7791 startinpos = s-starts;
7792 endinpos = startinpos+1;
7793 if (unicode_decode_call_errorhandler_writer(
7794 errors, &errorHandler,
7795 "charmap", "character maps to <undefined>",
7796 &starts, &e, &startinpos, &endinpos, &exc, &s,
7797 writer)) {
7798 goto onError;
7799 }
7800 continue;
7801 }
7802
7803 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7804 goto onError;
7805 ++s;
7806 }
7807 Py_XDECREF(errorHandler);
7808 Py_XDECREF(exc);
7809 return 0;
7810
7811onError:
7812 Py_XDECREF(errorHandler);
7813 Py_XDECREF(exc);
7814 return -1;
7815}
7816
7817static int
7818charmap_decode_mapping(const char *s,
7819 Py_ssize_t size,
7820 PyObject *mapping,
7821 const char *errors,
7822 _PyUnicodeWriter *writer)
7823{
7824 const char *starts = s;
7825 const char *e;
7826 Py_ssize_t startinpos, endinpos;
7827 PyObject *errorHandler = NULL, *exc = NULL;
7828 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007829 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007830
7831 e = s + size;
7832
7833 while (s < e) {
7834 ch = *s;
7835
7836 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7837 key = PyLong_FromLong((long)ch);
7838 if (key == NULL)
7839 goto onError;
7840
7841 item = PyObject_GetItem(mapping, key);
7842 Py_DECREF(key);
7843 if (item == NULL) {
7844 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7845 /* No mapping found means: mapping is undefined. */
7846 PyErr_Clear();
7847 goto Undefined;
7848 } else
7849 goto onError;
7850 }
7851
7852 /* Apply mapping */
7853 if (item == Py_None)
7854 goto Undefined;
7855 if (PyLong_Check(item)) {
7856 long value = PyLong_AS_LONG(item);
7857 if (value == 0xFFFE)
7858 goto Undefined;
7859 if (value < 0 || value > MAX_UNICODE) {
7860 PyErr_Format(PyExc_TypeError,
7861 "character mapping must be in range(0x%lx)",
7862 (unsigned long)MAX_UNICODE + 1);
7863 goto onError;
7864 }
7865
7866 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7867 goto onError;
7868 }
7869 else if (PyUnicode_Check(item)) {
7870 if (PyUnicode_READY(item) == -1)
7871 goto onError;
7872 if (PyUnicode_GET_LENGTH(item) == 1) {
7873 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7874 if (value == 0xFFFE)
7875 goto Undefined;
7876 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7877 goto onError;
7878 }
7879 else {
7880 writer->overallocate = 1;
7881 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7882 goto onError;
7883 }
7884 }
7885 else {
7886 /* wrong return value */
7887 PyErr_SetString(PyExc_TypeError,
7888 "character mapping must return integer, None or str");
7889 goto onError;
7890 }
7891 Py_CLEAR(item);
7892 ++s;
7893 continue;
7894
7895Undefined:
7896 /* undefined mapping */
7897 Py_CLEAR(item);
7898 startinpos = s-starts;
7899 endinpos = startinpos+1;
7900 if (unicode_decode_call_errorhandler_writer(
7901 errors, &errorHandler,
7902 "charmap", "character maps to <undefined>",
7903 &starts, &e, &startinpos, &endinpos, &exc, &s,
7904 writer)) {
7905 goto onError;
7906 }
7907 }
7908 Py_XDECREF(errorHandler);
7909 Py_XDECREF(exc);
7910 return 0;
7911
7912onError:
7913 Py_XDECREF(item);
7914 Py_XDECREF(errorHandler);
7915 Py_XDECREF(exc);
7916 return -1;
7917}
7918
Alexander Belopolsky40018472011-02-26 01:02:56 +00007919PyObject *
7920PyUnicode_DecodeCharmap(const char *s,
7921 Py_ssize_t size,
7922 PyObject *mapping,
7923 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007925 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007926
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927 /* Default to Latin-1 */
7928 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007932 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007933 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007934 writer.min_length = size;
7935 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007937
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007938 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007939 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7940 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007941 }
7942 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007943 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7944 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007946 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007947
Benjamin Peterson29060642009-01-31 22:14:21 +00007948 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007949 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950 return NULL;
7951}
7952
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007953/* Charmap encoding: the lookup table */
7954
Alexander Belopolsky40018472011-02-26 01:02:56 +00007955struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 PyObject_HEAD
7957 unsigned char level1[32];
7958 int count2, count3;
7959 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007960};
7961
7962static PyObject*
7963encoding_map_size(PyObject *obj, PyObject* args)
7964{
7965 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007966 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007968}
7969
7970static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007971 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 PyDoc_STR("Return the size (in bytes) of this object") },
7973 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007974};
7975
7976static void
7977encoding_map_dealloc(PyObject* o)
7978{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007979 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007980}
7981
7982static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007983 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 "EncodingMap", /*tp_name*/
7985 sizeof(struct encoding_map), /*tp_basicsize*/
7986 0, /*tp_itemsize*/
7987 /* methods */
7988 encoding_map_dealloc, /*tp_dealloc*/
7989 0, /*tp_print*/
7990 0, /*tp_getattr*/
7991 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007992 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 0, /*tp_repr*/
7994 0, /*tp_as_number*/
7995 0, /*tp_as_sequence*/
7996 0, /*tp_as_mapping*/
7997 0, /*tp_hash*/
7998 0, /*tp_call*/
7999 0, /*tp_str*/
8000 0, /*tp_getattro*/
8001 0, /*tp_setattro*/
8002 0, /*tp_as_buffer*/
8003 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8004 0, /*tp_doc*/
8005 0, /*tp_traverse*/
8006 0, /*tp_clear*/
8007 0, /*tp_richcompare*/
8008 0, /*tp_weaklistoffset*/
8009 0, /*tp_iter*/
8010 0, /*tp_iternext*/
8011 encoding_map_methods, /*tp_methods*/
8012 0, /*tp_members*/
8013 0, /*tp_getset*/
8014 0, /*tp_base*/
8015 0, /*tp_dict*/
8016 0, /*tp_descr_get*/
8017 0, /*tp_descr_set*/
8018 0, /*tp_dictoffset*/
8019 0, /*tp_init*/
8020 0, /*tp_alloc*/
8021 0, /*tp_new*/
8022 0, /*tp_free*/
8023 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008024};
8025
8026PyObject*
8027PyUnicode_BuildEncodingMap(PyObject* string)
8028{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008029 PyObject *result;
8030 struct encoding_map *mresult;
8031 int i;
8032 int need_dict = 0;
8033 unsigned char level1[32];
8034 unsigned char level2[512];
8035 unsigned char *mlevel1, *mlevel2, *mlevel3;
8036 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008037 int kind;
8038 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008039 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008040 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008042 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008043 PyErr_BadArgument();
8044 return NULL;
8045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008046 kind = PyUnicode_KIND(string);
8047 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008048 length = PyUnicode_GET_LENGTH(string);
8049 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008050 memset(level1, 0xFF, sizeof level1);
8051 memset(level2, 0xFF, sizeof level2);
8052
8053 /* If there isn't a one-to-one mapping of NULL to \0,
8054 or if there are non-BMP characters, we need to use
8055 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008056 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008057 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008058 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008059 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008060 ch = PyUnicode_READ(kind, data, i);
8061 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008062 need_dict = 1;
8063 break;
8064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008065 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008066 /* unmapped character */
8067 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008068 l1 = ch >> 11;
8069 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008070 if (level1[l1] == 0xFF)
8071 level1[l1] = count2++;
8072 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008073 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008074 }
8075
8076 if (count2 >= 0xFF || count3 >= 0xFF)
8077 need_dict = 1;
8078
8079 if (need_dict) {
8080 PyObject *result = PyDict_New();
8081 PyObject *key, *value;
8082 if (!result)
8083 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008084 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008085 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008086 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008087 if (!key || !value)
8088 goto failed1;
8089 if (PyDict_SetItem(result, key, value) == -1)
8090 goto failed1;
8091 Py_DECREF(key);
8092 Py_DECREF(value);
8093 }
8094 return result;
8095 failed1:
8096 Py_XDECREF(key);
8097 Py_XDECREF(value);
8098 Py_DECREF(result);
8099 return NULL;
8100 }
8101
8102 /* Create a three-level trie */
8103 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8104 16*count2 + 128*count3 - 1);
8105 if (!result)
8106 return PyErr_NoMemory();
8107 PyObject_Init(result, &EncodingMapType);
8108 mresult = (struct encoding_map*)result;
8109 mresult->count2 = count2;
8110 mresult->count3 = count3;
8111 mlevel1 = mresult->level1;
8112 mlevel2 = mresult->level23;
8113 mlevel3 = mresult->level23 + 16*count2;
8114 memcpy(mlevel1, level1, 32);
8115 memset(mlevel2, 0xFF, 16*count2);
8116 memset(mlevel3, 0, 128*count3);
8117 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008118 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008119 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008120 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8121 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008122 /* unmapped character */
8123 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008124 o1 = ch>>11;
8125 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008126 i2 = 16*mlevel1[o1] + o2;
8127 if (mlevel2[i2] == 0xFF)
8128 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008129 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008130 i3 = 128*mlevel2[i2] + o3;
8131 mlevel3[i3] = i;
8132 }
8133 return result;
8134}
8135
8136static int
Victor Stinner22168992011-11-20 17:09:18 +01008137encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008138{
8139 struct encoding_map *map = (struct encoding_map*)mapping;
8140 int l1 = c>>11;
8141 int l2 = (c>>7) & 0xF;
8142 int l3 = c & 0x7F;
8143 int i;
8144
Victor Stinner22168992011-11-20 17:09:18 +01008145 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008147 if (c == 0)
8148 return 0;
8149 /* level 1*/
8150 i = map->level1[l1];
8151 if (i == 0xFF) {
8152 return -1;
8153 }
8154 /* level 2*/
8155 i = map->level23[16*i+l2];
8156 if (i == 0xFF) {
8157 return -1;
8158 }
8159 /* level 3 */
8160 i = map->level23[16*map->count2 + 128*i + l3];
8161 if (i == 0) {
8162 return -1;
8163 }
8164 return i;
8165}
8166
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008167/* Lookup the character ch in the mapping. If the character
8168 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008169 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008170static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008171charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172{
Christian Heimes217cfd12007-12-02 14:31:20 +00008173 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008174 PyObject *x;
8175
8176 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008178 x = PyObject_GetItem(mapping, w);
8179 Py_DECREF(w);
8180 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8182 /* No mapping found means: mapping is undefined. */
8183 PyErr_Clear();
8184 x = Py_None;
8185 Py_INCREF(x);
8186 return x;
8187 } else
8188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008190 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008192 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 long value = PyLong_AS_LONG(x);
8194 if (value < 0 || value > 255) {
8195 PyErr_SetString(PyExc_TypeError,
8196 "character mapping must be in range(256)");
8197 Py_DECREF(x);
8198 return NULL;
8199 }
8200 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008202 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008203 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 /* wrong return value */
8206 PyErr_Format(PyExc_TypeError,
8207 "character mapping must return integer, bytes or None, not %.400s",
8208 x->ob_type->tp_name);
8209 Py_DECREF(x);
8210 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211 }
8212}
8213
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008214static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008215charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008216{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008217 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8218 /* exponentially overallocate to minimize reallocations */
8219 if (requiredsize < 2*outsize)
8220 requiredsize = 2*outsize;
8221 if (_PyBytes_Resize(outobj, requiredsize))
8222 return -1;
8223 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008224}
8225
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008230 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008231 space is available. Return a new reference to the object that
8232 was put in the output buffer, or Py_None, if the mapping was undefined
8233 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008234 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008235static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008236charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008237 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008239 PyObject *rep;
8240 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008241 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242
Christian Heimes90aa7642007-12-19 02:45:37 +00008243 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008244 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008246 if (res == -1)
8247 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 if (outsize<requiredsize)
8249 if (charmapencode_resize(outobj, outpos, requiredsize))
8250 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008251 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 outstart[(*outpos)++] = (char)res;
8253 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008254 }
8255
8256 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008259 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 Py_DECREF(rep);
8261 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008262 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 if (PyLong_Check(rep)) {
8264 Py_ssize_t requiredsize = *outpos+1;
8265 if (outsize<requiredsize)
8266 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8267 Py_DECREF(rep);
8268 return enc_EXCEPTION;
8269 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008270 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 else {
8274 const char *repchars = PyBytes_AS_STRING(rep);
8275 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8276 Py_ssize_t requiredsize = *outpos+repsize;
8277 if (outsize<requiredsize)
8278 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8279 Py_DECREF(rep);
8280 return enc_EXCEPTION;
8281 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008282 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 memcpy(outstart + *outpos, repchars, repsize);
8284 *outpos += repsize;
8285 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008287 Py_DECREF(rep);
8288 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289}
8290
8291/* handle an error in PyUnicode_EncodeCharmap
8292 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008293static int
8294charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008295 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008296 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008297 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008298 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299{
8300 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008301 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008302 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008303 enum PyUnicode_Kind kind;
8304 void *data;
8305 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008307 Py_ssize_t collstartpos = *inpos;
8308 Py_ssize_t collendpos = *inpos+1;
8309 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008310 char *encoding = "charmap";
8311 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008313 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008314 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008315
Benjamin Petersonbac79492012-01-14 13:34:47 -05008316 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008317 return -1;
8318 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008319 /* find all unencodable characters */
8320 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008321 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008322 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008323 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008324 val = encoding_map_lookup(ch, mapping);
8325 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 break;
8327 ++collendpos;
8328 continue;
8329 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008330
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008331 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8332 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 if (rep==NULL)
8334 return -1;
8335 else if (rep!=Py_None) {
8336 Py_DECREF(rep);
8337 break;
8338 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008339 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341 }
8342 /* cache callback name lookup
8343 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008344 if (*error_handler == _Py_ERROR_UNKNOWN)
8345 *error_handler = get_error_handler(errors);
8346
8347 switch (*error_handler) {
8348 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008349 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008350 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008351
8352 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008353 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 x = charmapencode_output('?', mapping, res, respos);
8355 if (x==enc_EXCEPTION) {
8356 return -1;
8357 }
8358 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008359 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 return -1;
8361 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008362 }
8363 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008364 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008365 *inpos = collendpos;
8366 break;
Victor Stinner50149202015-09-22 00:26:54 +02008367
8368 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008369 /* generate replacement (temporarily (mis)uses p) */
8370 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 char buffer[2+29+1+1];
8372 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008373 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 for (cp = buffer; *cp; ++cp) {
8375 x = charmapencode_output(*cp, mapping, res, respos);
8376 if (x==enc_EXCEPTION)
8377 return -1;
8378 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008379 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 return -1;
8381 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008382 }
8383 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008384 *inpos = collendpos;
8385 break;
Victor Stinner50149202015-09-22 00:26:54 +02008386
Benjamin Peterson14339b62009-01-31 16:36:08 +00008387 default:
Victor Stinner50149202015-09-22 00:26:54 +02008388 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008389 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008391 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008393 if (PyBytes_Check(repunicode)) {
8394 /* Directly copy bytes result to output. */
8395 Py_ssize_t outsize = PyBytes_Size(*res);
8396 Py_ssize_t requiredsize;
8397 repsize = PyBytes_Size(repunicode);
8398 requiredsize = *respos + repsize;
8399 if (requiredsize > outsize)
8400 /* Make room for all additional bytes. */
8401 if (charmapencode_resize(res, respos, requiredsize)) {
8402 Py_DECREF(repunicode);
8403 return -1;
8404 }
8405 memcpy(PyBytes_AsString(*res) + *respos,
8406 PyBytes_AsString(repunicode), repsize);
8407 *respos += repsize;
8408 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008409 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008410 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008411 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008412 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008413 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008414 Py_DECREF(repunicode);
8415 return -1;
8416 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008417 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008418 data = PyUnicode_DATA(repunicode);
8419 kind = PyUnicode_KIND(repunicode);
8420 for (index = 0; index < repsize; index++) {
8421 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8422 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008424 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 return -1;
8426 }
8427 else if (x==enc_FAILED) {
8428 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008429 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 return -1;
8431 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 }
8433 *inpos = newpos;
8434 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435 }
8436 return 0;
8437}
8438
Alexander Belopolsky40018472011-02-26 01:02:56 +00008439PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008440_PyUnicode_EncodeCharmap(PyObject *unicode,
8441 PyObject *mapping,
8442 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 /* output object */
8445 PyObject *res = NULL;
8446 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008447 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008448 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008449 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008450 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008451 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008452 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008453 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008454 void *data;
8455 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456
Benjamin Petersonbac79492012-01-14 13:34:47 -05008457 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008458 return NULL;
8459 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008460 data = PyUnicode_DATA(unicode);
8461 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008462
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463 /* Default to Latin-1 */
8464 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008465 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008467 /* allocate enough for a simple encoding without
8468 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008469 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008470 if (res == NULL)
8471 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008472 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008474
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008476 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008478 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 if (x==enc_EXCEPTION) /* error */
8480 goto onError;
8481 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008482 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008484 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 &res, &respos)) {
8486 goto onError;
8487 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008488 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 else
8490 /* done with this character => adjust input position */
8491 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008494 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008495 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008496 if (_PyBytes_Resize(&res, respos) < 0)
8497 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008498
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008499 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008500 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008501 return res;
8502
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008504 Py_XDECREF(res);
8505 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008506 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 return NULL;
8508}
8509
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008510/* Deprecated */
8511PyObject *
8512PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8513 Py_ssize_t size,
8514 PyObject *mapping,
8515 const char *errors)
8516{
8517 PyObject *result;
8518 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8519 if (unicode == NULL)
8520 return NULL;
8521 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8522 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008523 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008524}
8525
Alexander Belopolsky40018472011-02-26 01:02:56 +00008526PyObject *
8527PyUnicode_AsCharmapString(PyObject *unicode,
8528 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529{
8530 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 PyErr_BadArgument();
8532 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008534 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535}
8536
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008538static void
8539make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008541 Py_ssize_t startpos, Py_ssize_t endpos,
8542 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 *exceptionObject = _PyUnicodeTranslateError_Create(
8546 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547 }
8548 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8550 goto onError;
8551 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8552 goto onError;
8553 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8554 goto onError;
8555 return;
8556 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008557 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558 }
8559}
8560
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561/* error handling callback helper:
8562 build arguments, call the callback and check the arguments,
8563 put the result into newpos and return the replacement string, which
8564 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008565static PyObject *
8566unicode_translate_call_errorhandler(const char *errors,
8567 PyObject **errorHandler,
8568 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008570 Py_ssize_t startpos, Py_ssize_t endpos,
8571 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008573 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008575 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008576 PyObject *restuple;
8577 PyObject *resunicode;
8578
8579 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008581 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583 }
8584
8585 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589
8590 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008595 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 Py_DECREF(restuple);
8597 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598 }
8599 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008600 &resunicode, &i_newpos)) {
8601 Py_DECREF(restuple);
8602 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008603 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008604 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008605 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008606 else
8607 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008609 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 Py_DECREF(restuple);
8611 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008612 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008613 Py_INCREF(resunicode);
8614 Py_DECREF(restuple);
8615 return resunicode;
8616}
8617
8618/* Lookup the character ch in the mapping and put the result in result,
8619 which must be decrefed by the caller.
8620 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008621static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623{
Christian Heimes217cfd12007-12-02 14:31:20 +00008624 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 PyObject *x;
8626
8627 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629 x = PyObject_GetItem(mapping, w);
8630 Py_DECREF(w);
8631 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8633 /* No mapping found means: use 1:1 mapping. */
8634 PyErr_Clear();
8635 *result = NULL;
8636 return 0;
8637 } else
8638 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639 }
8640 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 *result = x;
8642 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008644 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008646 if (value < 0 || value > MAX_UNICODE) {
8647 PyErr_Format(PyExc_ValueError,
8648 "character mapping must be in range(0x%x)",
8649 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 Py_DECREF(x);
8651 return -1;
8652 }
8653 *result = x;
8654 return 0;
8655 }
8656 else if (PyUnicode_Check(x)) {
8657 *result = x;
8658 return 0;
8659 }
8660 else {
8661 /* wrong return value */
8662 PyErr_SetString(PyExc_TypeError,
8663 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008664 Py_DECREF(x);
8665 return -1;
8666 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667}
Victor Stinner1194ea02014-04-04 19:37:40 +02008668
8669/* lookup the character, write the result into the writer.
8670 Return 1 if the result was written into the writer, return 0 if the mapping
8671 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008672static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008673charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8674 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675{
Victor Stinner1194ea02014-04-04 19:37:40 +02008676 PyObject *item;
8677
8678 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008680
8681 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008683 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008686 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008688
8689 if (item == Py_None) {
8690 Py_DECREF(item);
8691 return 0;
8692 }
8693
8694 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008695 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8696 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8697 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008698 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8699 Py_DECREF(item);
8700 return -1;
8701 }
8702 Py_DECREF(item);
8703 return 1;
8704 }
8705
8706 if (!PyUnicode_Check(item)) {
8707 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008709 }
8710
8711 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8712 Py_DECREF(item);
8713 return -1;
8714 }
8715
8716 Py_DECREF(item);
8717 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718}
8719
Victor Stinner89a76ab2014-04-05 11:44:04 +02008720static int
8721unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8722 Py_UCS1 *translate)
8723{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008724 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008725 int ret = 0;
8726
Victor Stinner89a76ab2014-04-05 11:44:04 +02008727 if (charmaptranslate_lookup(ch, mapping, &item)) {
8728 return -1;
8729 }
8730
8731 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008732 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008733 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008734 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008735 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008736 /* not found => default to 1:1 mapping */
8737 translate[ch] = ch;
8738 return 1;
8739 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008740 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008741 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008742 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8743 used it */
8744 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008745 /* invalid character or character outside ASCII:
8746 skip the fast translate */
8747 goto exit;
8748 }
8749 translate[ch] = (Py_UCS1)replace;
8750 }
8751 else if (PyUnicode_Check(item)) {
8752 Py_UCS4 replace;
8753
8754 if (PyUnicode_READY(item) == -1) {
8755 Py_DECREF(item);
8756 return -1;
8757 }
8758 if (PyUnicode_GET_LENGTH(item) != 1)
8759 goto exit;
8760
8761 replace = PyUnicode_READ_CHAR(item, 0);
8762 if (replace > 127)
8763 goto exit;
8764 translate[ch] = (Py_UCS1)replace;
8765 }
8766 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008767 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008768 goto exit;
8769 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008770 ret = 1;
8771
Benjamin Peterson1365de72014-04-07 20:15:41 -04008772 exit:
8773 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008774 return ret;
8775}
8776
8777/* Fast path for ascii => ascii translation. Return 1 if the whole string
8778 was translated into writer, return 0 if the input string was partially
8779 translated into writer, raise an exception and return -1 on error. */
8780static int
8781unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner872b2912014-04-05 14:27:07 +02008782 _PyUnicodeWriter *writer, int ignore)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008783{
Victor Stinner872b2912014-04-05 14:27:07 +02008784 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008785 Py_ssize_t len;
8786 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008787 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008788
8789 if (PyUnicode_READY(input) == -1)
8790 return -1;
8791 if (!PyUnicode_IS_ASCII(input))
8792 return 0;
8793 len = PyUnicode_GET_LENGTH(input);
8794
Victor Stinner872b2912014-04-05 14:27:07 +02008795 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008796
8797 in = PyUnicode_1BYTE_DATA(input);
8798 end = in + len;
8799
8800 assert(PyUnicode_IS_ASCII(writer->buffer));
8801 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8802 out = PyUnicode_1BYTE_DATA(writer->buffer);
8803
Victor Stinner872b2912014-04-05 14:27:07 +02008804 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008805 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008806 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008807 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008808 int translate = unicode_fast_translate_lookup(mapping, ch,
8809 ascii_table);
8810 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008811 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008812 if (translate == 0)
8813 goto exit;
8814 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008815 }
Victor Stinner872b2912014-04-05 14:27:07 +02008816 if (ch2 == 0xfe) {
8817 if (ignore)
8818 continue;
8819 goto exit;
8820 }
8821 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008822 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008823 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008824 }
Victor Stinner872b2912014-04-05 14:27:07 +02008825 res = 1;
8826
8827exit:
8828 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8829 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008830}
8831
Victor Stinner3222da22015-10-01 22:07:32 +02008832static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833_PyUnicode_TranslateCharmap(PyObject *input,
8834 PyObject *mapping,
8835 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008838 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839 Py_ssize_t size, i;
8840 int kind;
8841 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008842 _PyUnicodeWriter writer;
8843 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008844 char *reason = "character maps to <undefined>";
8845 PyObject *errorHandler = NULL;
8846 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008847 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008848 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008849
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 PyErr_BadArgument();
8852 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 if (PyUnicode_READY(input) == -1)
8856 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008857 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 kind = PyUnicode_KIND(input);
8859 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008860
8861 if (size == 0) {
8862 Py_INCREF(input);
8863 return input;
8864 }
8865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008866 /* allocate enough for a simple 1:1 translation without
8867 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008868 _PyUnicodeWriter_Init(&writer);
8869 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008870 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871
Victor Stinner872b2912014-04-05 14:27:07 +02008872 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8873
8874 res = unicode_fast_translate(input, mapping, &writer, ignore);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875 if (res < 0) {
8876 _PyUnicodeWriter_Dealloc(&writer);
8877 return NULL;
8878 }
8879 if (res == 1)
8880 return _PyUnicodeWriter_Finish(&writer);
8881
Victor Stinner89a76ab2014-04-05 11:44:04 +02008882 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008885 int translate;
8886 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8887 Py_ssize_t newpos;
8888 /* startpos for collecting untranslatable chars */
8889 Py_ssize_t collstart;
8890 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008891 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892
Victor Stinner1194ea02014-04-04 19:37:40 +02008893 ch = PyUnicode_READ(kind, data, i);
8894 translate = charmaptranslate_output(ch, mapping, &writer);
8895 if (translate < 0)
8896 goto onError;
8897
8898 if (translate != 0) {
8899 /* it worked => adjust input pointer */
8900 ++i;
8901 continue;
8902 }
8903
8904 /* untranslatable character */
8905 collstart = i;
8906 collend = i+1;
8907
8908 /* find all untranslatable characters */
8909 while (collend < size) {
8910 PyObject *x;
8911 ch = PyUnicode_READ(kind, data, collend);
8912 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008913 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008914 Py_XDECREF(x);
8915 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008917 ++collend;
8918 }
8919
8920 if (ignore) {
8921 i = collend;
8922 }
8923 else {
8924 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8925 reason, input, &exc,
8926 collstart, collend, &newpos);
8927 if (repunicode == NULL)
8928 goto onError;
8929 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008931 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008932 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008933 Py_DECREF(repunicode);
8934 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008935 }
8936 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008937 Py_XDECREF(exc);
8938 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008939 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940
Benjamin Peterson29060642009-01-31 22:14:21 +00008941 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008942 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008943 Py_XDECREF(exc);
8944 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945 return NULL;
8946}
8947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948/* Deprecated. Use PyUnicode_Translate instead. */
8949PyObject *
8950PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8951 Py_ssize_t size,
8952 PyObject *mapping,
8953 const char *errors)
8954{
Christian Heimes5f520f42012-09-11 14:03:25 +02008955 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8957 if (!unicode)
8958 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008959 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8960 Py_DECREF(unicode);
8961 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962}
8963
Alexander Belopolsky40018472011-02-26 01:02:56 +00008964PyObject *
8965PyUnicode_Translate(PyObject *str,
8966 PyObject *mapping,
8967 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968{
8969 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008970
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971 str = PyUnicode_FromObject(str);
8972 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008973 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 Py_DECREF(str);
8976 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977}
Tim Petersced69f82003-09-16 20:30:58 +00008978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008980fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981{
8982 /* No need to call PyUnicode_READY(self) because this function is only
8983 called as a callback from fixup() which does it already. */
8984 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8985 const int kind = PyUnicode_KIND(self);
8986 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008987 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008988 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 Py_ssize_t i;
8990
8991 for (i = 0; i < len; ++i) {
8992 ch = PyUnicode_READ(kind, data, i);
8993 fixed = 0;
8994 if (ch > 127) {
8995 if (Py_UNICODE_ISSPACE(ch))
8996 fixed = ' ';
8997 else {
8998 const int decimal = Py_UNICODE_TODECIMAL(ch);
8999 if (decimal >= 0)
9000 fixed = '0' + decimal;
9001 }
9002 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009003 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009004 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 PyUnicode_WRITE(kind, data, i, fixed);
9006 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009007 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009008 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 }
9011
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009012 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013}
9014
9015PyObject *
9016_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9017{
9018 if (!PyUnicode_Check(unicode)) {
9019 PyErr_BadInternalCall();
9020 return NULL;
9021 }
9022 if (PyUnicode_READY(unicode) == -1)
9023 return NULL;
9024 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9025 /* If the string is already ASCII, just return the same string */
9026 Py_INCREF(unicode);
9027 return unicode;
9028 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009029 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030}
9031
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009032PyObject *
9033PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9034 Py_ssize_t length)
9035{
Victor Stinnerf0124502011-11-21 23:12:56 +01009036 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009037 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009038 Py_UCS4 maxchar;
9039 enum PyUnicode_Kind kind;
9040 void *data;
9041
Victor Stinner99d7ad02012-02-22 13:37:39 +01009042 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009043 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009044 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009045 if (ch > 127) {
9046 int decimal = Py_UNICODE_TODECIMAL(ch);
9047 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009048 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009049 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009050 }
9051 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009052
9053 /* Copy to a new string */
9054 decimal = PyUnicode_New(length, maxchar);
9055 if (decimal == NULL)
9056 return decimal;
9057 kind = PyUnicode_KIND(decimal);
9058 data = PyUnicode_DATA(decimal);
9059 /* Iterate over code points */
9060 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009061 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009062 if (ch > 127) {
9063 int decimal = Py_UNICODE_TODECIMAL(ch);
9064 if (decimal >= 0)
9065 ch = '0' + decimal;
9066 }
9067 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009069 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009070}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009071/* --- Decimal Encoder ---------------------------------------------------- */
9072
Alexander Belopolsky40018472011-02-26 01:02:56 +00009073int
9074PyUnicode_EncodeDecimal(Py_UNICODE *s,
9075 Py_ssize_t length,
9076 char *output,
9077 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009078{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009079 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009080 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009081 enum PyUnicode_Kind kind;
9082 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009083
9084 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 PyErr_BadArgument();
9086 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009087 }
9088
Victor Stinner42bf7752011-11-21 22:52:58 +01009089 unicode = PyUnicode_FromUnicode(s, length);
9090 if (unicode == NULL)
9091 return -1;
9092
Benjamin Petersonbac79492012-01-14 13:34:47 -05009093 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009094 Py_DECREF(unicode);
9095 return -1;
9096 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009097 kind = PyUnicode_KIND(unicode);
9098 data = PyUnicode_DATA(unicode);
9099
Victor Stinnerb84d7232011-11-22 01:50:07 +01009100 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009101 PyObject *exc;
9102 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009103 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009104 Py_ssize_t startpos;
9105
9106 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009107
Benjamin Peterson29060642009-01-31 22:14:21 +00009108 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009109 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009110 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009112 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 decimal = Py_UNICODE_TODECIMAL(ch);
9114 if (decimal >= 0) {
9115 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009116 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 continue;
9118 }
9119 if (0 < ch && ch < 256) {
9120 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009121 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 continue;
9123 }
Victor Stinner6345be92011-11-25 20:09:01 +01009124
Victor Stinner42bf7752011-11-21 22:52:58 +01009125 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009126 exc = NULL;
9127 raise_encode_exception(&exc, "decimal", unicode,
9128 startpos, startpos+1,
9129 "invalid decimal Unicode string");
9130 Py_XDECREF(exc);
9131 Py_DECREF(unicode);
9132 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009133 }
9134 /* 0-terminate the output string */
9135 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009136 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009137 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009138}
9139
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140/* --- Helpers ------------------------------------------------------------ */
9141
/* Helper macro to fix up start/end slice values in place:

     - `end` larger than `len` is clamped to `len`;
     - a negative `end` or `start` is taken relative to `len`, and clamped
       to 0 if it is still negative afterwards.

   `start` and `end` must be modifiable lvalues.  `len` is evaluated more
   than once, so it must not have side effects.

   The body is wrapped in do { } while (0) so the expansion is exactly one
   statement: it needs the usual trailing semicolon and can be used as the
   unbraced body of an if/else without capturing the `else`. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009157static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009158any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159 Py_ssize_t start,
9160 Py_ssize_t end)
9161{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009162 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163 void *buf1, *buf2;
9164 Py_ssize_t len1, len2, result;
9165
9166 kind1 = PyUnicode_KIND(s1);
9167 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009168 if (kind1 < kind2)
9169 return -1;
9170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 len1 = PyUnicode_GET_LENGTH(s1);
9172 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009173 ADJUST_INDICES(start, end, len1);
9174 if (end - start < len2)
9175 return -1;
9176
9177 buf1 = PyUnicode_DATA(s1);
9178 buf2 = PyUnicode_DATA(s2);
9179 if (len2 == 1) {
9180 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9181 result = findchar((const char *)buf1 + kind1*start,
9182 kind1, end - start, ch, direction);
9183 if (result == -1)
9184 return -1;
9185 else
9186 return start + result;
9187 }
9188
9189 if (kind2 != kind1) {
9190 buf2 = _PyUnicode_AsKind(s2, kind1);
9191 if (!buf2)
9192 return -2;
9193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194
Victor Stinner794d5672011-10-10 03:21:36 +02009195 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009196 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009197 case PyUnicode_1BYTE_KIND:
9198 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9199 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9200 else
9201 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9202 break;
9203 case PyUnicode_2BYTE_KIND:
9204 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9205 break;
9206 case PyUnicode_4BYTE_KIND:
9207 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9208 break;
9209 default:
9210 assert(0); result = -2;
9211 }
9212 }
9213 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009214 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009215 case PyUnicode_1BYTE_KIND:
9216 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9217 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9218 else
9219 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9220 break;
9221 case PyUnicode_2BYTE_KIND:
9222 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9223 break;
9224 case PyUnicode_4BYTE_KIND:
9225 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9226 break;
9227 default:
9228 assert(0); result = -2;
9229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 }
9231
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009232 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233 PyMem_Free(buf2);
9234
9235 return result;
9236}
9237
9238Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009239_PyUnicode_InsertThousandsGrouping(
9240 PyObject *unicode, Py_ssize_t index,
9241 Py_ssize_t n_buffer,
9242 void *digits, Py_ssize_t n_digits,
9243 Py_ssize_t min_width,
9244 const char *grouping, PyObject *thousands_sep,
9245 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246{
Victor Stinner41a863c2012-02-24 00:37:51 +01009247 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009248 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009249 Py_ssize_t thousands_sep_len;
9250 Py_ssize_t len;
9251
9252 if (unicode != NULL) {
9253 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009254 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009255 }
9256 else {
9257 kind = PyUnicode_1BYTE_KIND;
9258 data = NULL;
9259 }
9260 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9261 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9262 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9263 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009264 if (thousands_sep_kind < kind) {
9265 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9266 if (!thousands_sep_data)
9267 return -1;
9268 }
9269 else {
9270 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9271 if (!data)
9272 return -1;
9273 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009274 }
9275
Benjamin Petersonead6b532011-12-20 17:23:42 -06009276 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009278 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009279 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009280 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009281 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009282 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009283 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009284 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009285 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009286 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009287 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009288 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009290 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009291 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009292 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009293 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009294 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009296 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009297 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009298 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009299 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009300 break;
9301 default:
9302 assert(0);
9303 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009305 if (unicode != NULL && thousands_sep_kind != kind) {
9306 if (thousands_sep_kind < kind)
9307 PyMem_Free(thousands_sep_data);
9308 else
9309 PyMem_Free(data);
9310 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009311 if (unicode == NULL) {
9312 *maxchar = 127;
9313 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009314 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009315 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009316 }
9317 }
9318 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319}
9320
9321
Alexander Belopolsky40018472011-02-26 01:02:56 +00009322Py_ssize_t
9323PyUnicode_Count(PyObject *str,
9324 PyObject *substr,
9325 Py_ssize_t start,
9326 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009327{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009328 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009329 PyObject* str_obj;
9330 PyObject* sub_obj;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009331 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332 void *buf1 = NULL, *buf2 = NULL;
9333 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009334
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009335 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009336 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009337 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009338 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009339 if (!sub_obj) {
9340 Py_DECREF(str_obj);
9341 return -1;
9342 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009343 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009344 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009345 Py_DECREF(str_obj);
9346 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347 }
Tim Petersced69f82003-09-16 20:30:58 +00009348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 kind1 = PyUnicode_KIND(str_obj);
9350 kind2 = PyUnicode_KIND(sub_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009351 if (kind1 < kind2) {
9352 Py_DECREF(sub_obj);
9353 Py_DECREF(str_obj);
9354 return 0;
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009355 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 len1 = PyUnicode_GET_LENGTH(str_obj);
9358 len2 = PyUnicode_GET_LENGTH(sub_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009360 if (end - start < len2) {
9361 Py_DECREF(sub_obj);
9362 Py_DECREF(str_obj);
9363 return 0;
9364 }
9365
9366 buf1 = PyUnicode_DATA(str_obj);
9367 buf2 = PyUnicode_DATA(sub_obj);
9368 if (kind2 != kind1) {
9369 buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9370 if (!buf2)
9371 goto onError;
9372 }
9373
9374 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009376 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9377 result = asciilib_count(
9378 ((Py_UCS1*)buf1) + start, end - start,
9379 buf2, len2, PY_SSIZE_T_MAX
9380 );
9381 else
9382 result = ucs1lib_count(
9383 ((Py_UCS1*)buf1) + start, end - start,
9384 buf2, len2, PY_SSIZE_T_MAX
9385 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 break;
9387 case PyUnicode_2BYTE_KIND:
9388 result = ucs2lib_count(
9389 ((Py_UCS2*)buf1) + start, end - start,
9390 buf2, len2, PY_SSIZE_T_MAX
9391 );
9392 break;
9393 case PyUnicode_4BYTE_KIND:
9394 result = ucs4lib_count(
9395 ((Py_UCS4*)buf1) + start, end - start,
9396 buf2, len2, PY_SSIZE_T_MAX
9397 );
9398 break;
9399 default:
9400 assert(0); result = 0;
9401 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009402
9403 Py_DECREF(sub_obj);
9404 Py_DECREF(str_obj);
9405
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009406 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009407 PyMem_Free(buf2);
9408
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 onError:
9411 Py_DECREF(sub_obj);
9412 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009413 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 PyMem_Free(buf2);
9415 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416}
9417
Alexander Belopolsky40018472011-02-26 01:02:56 +00009418Py_ssize_t
9419PyUnicode_Find(PyObject *str,
9420 PyObject *sub,
9421 Py_ssize_t start,
9422 Py_ssize_t end,
9423 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009425 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009426
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009428 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009430 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009431 if (!sub) {
9432 Py_DECREF(str);
9433 return -2;
9434 }
9435 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9436 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009437 Py_DECREF(str);
9438 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439 }
Tim Petersced69f82003-09-16 20:30:58 +00009440
Victor Stinner794d5672011-10-10 03:21:36 +02009441 result = any_find_slice(direction,
9442 str, sub, start, end
9443 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009444
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009446 Py_DECREF(sub);
9447
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448 return result;
9449}
9450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451Py_ssize_t
9452PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9453 Py_ssize_t start, Py_ssize_t end,
9454 int direction)
9455{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009457 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 if (PyUnicode_READY(str) == -1)
9459 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009460 if (start < 0 || end < 0) {
9461 PyErr_SetString(PyExc_IndexError, "string index out of range");
9462 return -2;
9463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 if (end > PyUnicode_GET_LENGTH(str))
9465 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009466 if (start >= end)
9467 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009469 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9470 kind, end-start, ch, direction);
9471 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009473 else
9474 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475}
9476
Alexander Belopolsky40018472011-02-26 01:02:56 +00009477static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009478tailmatch(PyObject *self,
9479 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009480 Py_ssize_t start,
9481 Py_ssize_t end,
9482 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 int kind_self;
9485 int kind_sub;
9486 void *data_self;
9487 void *data_sub;
9488 Py_ssize_t offset;
9489 Py_ssize_t i;
9490 Py_ssize_t end_sub;
9491
9492 if (PyUnicode_READY(self) == -1 ||
9493 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009494 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9497 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009499 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009501 if (PyUnicode_GET_LENGTH(substring) == 0)
9502 return 1;
9503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 kind_self = PyUnicode_KIND(self);
9505 data_self = PyUnicode_DATA(self);
9506 kind_sub = PyUnicode_KIND(substring);
9507 data_sub = PyUnicode_DATA(substring);
9508 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9509
9510 if (direction > 0)
9511 offset = end;
9512 else
9513 offset = start;
9514
9515 if (PyUnicode_READ(kind_self, data_self, offset) ==
9516 PyUnicode_READ(kind_sub, data_sub, 0) &&
9517 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9518 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9519 /* If both are of the same kind, memcmp is sufficient */
9520 if (kind_self == kind_sub) {
9521 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009522 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 data_sub,
9524 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009525 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 }
9527 /* otherwise we have to compare each character by first accesing it */
9528 else {
9529 /* We do not need to compare 0 and len(substring)-1 because
9530 the if statement above ensured already that they are equal
9531 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 for (i = 1; i < end_sub; ++i) {
9533 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9534 PyUnicode_READ(kind_sub, data_sub, i))
9535 return 0;
9536 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539 }
9540
9541 return 0;
9542}
9543
Alexander Belopolsky40018472011-02-26 01:02:56 +00009544Py_ssize_t
9545PyUnicode_Tailmatch(PyObject *str,
9546 PyObject *substr,
9547 Py_ssize_t start,
9548 Py_ssize_t end,
9549 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009551 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009552
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553 str = PyUnicode_FromObject(str);
9554 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556 substr = PyUnicode_FromObject(substr);
9557 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009558 Py_DECREF(str);
9559 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009560 }
Tim Petersced69f82003-09-16 20:30:58 +00009561
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009562 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009563 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009564 Py_DECREF(str);
9565 Py_DECREF(substr);
9566 return result;
9567}
9568
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569/* Apply fixfct filter to the Unicode object self and return a
9570 reference to the modified object */
9571
Alexander Belopolsky40018472011-02-26 01:02:56 +00009572static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009573fixup(PyObject *self,
9574 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576 PyObject *u;
9577 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009578 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009579
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009580 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009583 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009585 /* fix functions return the new maximum character in a string,
9586 if the kind of the resulting unicode object does not change,
9587 everything is fine. Otherwise we need to change the string kind
9588 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009589 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009590
9591 if (maxchar_new == 0) {
9592 /* no changes */;
9593 if (PyUnicode_CheckExact(self)) {
9594 Py_DECREF(u);
9595 Py_INCREF(self);
9596 return self;
9597 }
9598 else
9599 return u;
9600 }
9601
Victor Stinnere6abb482012-05-02 01:15:40 +02009602 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603
Victor Stinnereaab6042011-12-11 22:22:39 +01009604 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009606
9607 /* In case the maximum character changed, we need to
9608 convert the string to the new category. */
9609 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9610 if (v == NULL) {
9611 Py_DECREF(u);
9612 return NULL;
9613 }
9614 if (maxchar_new > maxchar_old) {
9615 /* If the maxchar increased so that the kind changed, not all
9616 characters are representable anymore and we need to fix the
9617 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009618 _PyUnicode_FastCopyCharacters(v, 0,
9619 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009620 maxchar_old = fixfct(v);
9621 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 }
9623 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009624 _PyUnicode_FastCopyCharacters(v, 0,
9625 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009627 Py_DECREF(u);
9628 assert(_PyUnicode_CheckConsistency(v, 1));
9629 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630}
9631
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009632static PyObject *
9633ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009635 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9636 char *resdata, *data = PyUnicode_DATA(self);
9637 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009638
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009639 res = PyUnicode_New(len, 127);
9640 if (res == NULL)
9641 return NULL;
9642 resdata = PyUnicode_DATA(res);
9643 if (lower)
9644 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009646 _Py_bytes_upper(resdata, data, len);
9647 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648}
9649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009651handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009653 Py_ssize_t j;
9654 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009655 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009656 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009657
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9659
9660 where ! is a negation and \p{xxx} is a character with property xxx.
9661 */
9662 for (j = i - 1; j >= 0; j--) {
9663 c = PyUnicode_READ(kind, data, j);
9664 if (!_PyUnicode_IsCaseIgnorable(c))
9665 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009667 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9668 if (final_sigma) {
9669 for (j = i + 1; j < length; j++) {
9670 c = PyUnicode_READ(kind, data, j);
9671 if (!_PyUnicode_IsCaseIgnorable(c))
9672 break;
9673 }
9674 final_sigma = j == length || !_PyUnicode_IsCased(c);
9675 }
9676 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677}
9678
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679static int
9680lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9681 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 /* Obscure special case. */
9684 if (c == 0x3A3) {
9685 mapped[0] = handle_capital_sigma(kind, data, length, i);
9686 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689}
9690
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009691static Py_ssize_t
9692do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694 Py_ssize_t i, k = 0;
9695 int n_res, j;
9696 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009697
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009698 c = PyUnicode_READ(kind, data, 0);
9699 n_res = _PyUnicode_ToUpperFull(c, mapped);
9700 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009701 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009702 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704 for (i = 1; i < length; i++) {
9705 c = PyUnicode_READ(kind, data, i);
9706 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9707 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009708 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009710 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009711 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009712 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713}
9714
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009715static Py_ssize_t
9716do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9717 Py_ssize_t i, k = 0;
9718
9719 for (i = 0; i < length; i++) {
9720 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9721 int n_res, j;
9722 if (Py_UNICODE_ISUPPER(c)) {
9723 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9724 }
9725 else if (Py_UNICODE_ISLOWER(c)) {
9726 n_res = _PyUnicode_ToUpperFull(c, mapped);
9727 }
9728 else {
9729 n_res = 1;
9730 mapped[0] = c;
9731 }
9732 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009733 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009734 res[k++] = mapped[j];
9735 }
9736 }
9737 return k;
9738}
9739
9740static Py_ssize_t
9741do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9742 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009744 Py_ssize_t i, k = 0;
9745
9746 for (i = 0; i < length; i++) {
9747 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9748 int n_res, j;
9749 if (lower)
9750 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9751 else
9752 n_res = _PyUnicode_ToUpperFull(c, mapped);
9753 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009754 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009755 res[k++] = mapped[j];
9756 }
9757 }
9758 return k;
9759}
9760
9761static Py_ssize_t
9762do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9763{
9764 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9765}
9766
9767static Py_ssize_t
9768do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9769{
9770 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9771}
9772
Benjamin Petersone51757f2012-01-12 21:10:29 -05009773static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009774do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9775{
9776 Py_ssize_t i, k = 0;
9777
9778 for (i = 0; i < length; i++) {
9779 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9780 Py_UCS4 mapped[3];
9781 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9782 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009783 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009784 res[k++] = mapped[j];
9785 }
9786 }
9787 return k;
9788}
9789
9790static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009791do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9792{
9793 Py_ssize_t i, k = 0;
9794 int previous_is_cased;
9795
9796 previous_is_cased = 0;
9797 for (i = 0; i < length; i++) {
9798 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9799 Py_UCS4 mapped[3];
9800 int n_res, j;
9801
9802 if (previous_is_cased)
9803 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9804 else
9805 n_res = _PyUnicode_ToTitleFull(c, mapped);
9806
9807 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009808 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009809 res[k++] = mapped[j];
9810 }
9811
9812 previous_is_cased = _PyUnicode_IsCased(c);
9813 }
9814 return k;
9815}
9816
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009817static PyObject *
9818case_operation(PyObject *self,
9819 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9820{
9821 PyObject *res = NULL;
9822 Py_ssize_t length, newlength = 0;
9823 int kind, outkind;
9824 void *data, *outdata;
9825 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9826
Benjamin Petersoneea48462012-01-16 14:28:50 -05009827 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009828
9829 kind = PyUnicode_KIND(self);
9830 data = PyUnicode_DATA(self);
9831 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009832 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009833 PyErr_SetString(PyExc_OverflowError, "string is too long");
9834 return NULL;
9835 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009836 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009837 if (tmp == NULL)
9838 return PyErr_NoMemory();
9839 newlength = perform(kind, data, length, tmp, &maxchar);
9840 res = PyUnicode_New(newlength, maxchar);
9841 if (res == NULL)
9842 goto leave;
9843 tmpend = tmp + newlength;
9844 outdata = PyUnicode_DATA(res);
9845 outkind = PyUnicode_KIND(res);
9846 switch (outkind) {
9847 case PyUnicode_1BYTE_KIND:
9848 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9849 break;
9850 case PyUnicode_2BYTE_KIND:
9851 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9852 break;
9853 case PyUnicode_4BYTE_KIND:
9854 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9855 break;
9856 default:
9857 assert(0);
9858 break;
9859 }
9860 leave:
9861 PyMem_FREE(tmp);
9862 return res;
9863}
9864
Tim Peters8ce9f162004-08-27 01:49:32 +00009865PyObject *
9866PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009869 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009871 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009872 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9873 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009874 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009876 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009877 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009878 int use_memcpy;
9879 unsigned char *res_data = NULL, *sep_data = NULL;
9880 PyObject *last_obj;
9881 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009883 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009884 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009885 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009886 }
9887
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009888 /* NOTE: the following code can't call back into Python code,
9889 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009890 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009891
Tim Peters05eba1f2004-08-27 21:32:02 +00009892 seqlen = PySequence_Fast_GET_SIZE(fseq);
9893 /* If empty sequence, return u"". */
9894 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009895 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009896 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009897 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009898
Tim Peters05eba1f2004-08-27 21:32:02 +00009899 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009900 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009901 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009902 if (seqlen == 1) {
9903 if (PyUnicode_CheckExact(items[0])) {
9904 res = items[0];
9905 Py_INCREF(res);
9906 Py_DECREF(fseq);
9907 return res;
9908 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009909 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009910 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009911 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009912 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009913 /* Set up sep and seplen */
9914 if (separator == NULL) {
9915 /* fall back to a blank space separator */
9916 sep = PyUnicode_FromOrdinal(' ');
9917 if (!sep)
9918 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009919 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009920 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009921 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009922 else {
9923 if (!PyUnicode_Check(separator)) {
9924 PyErr_Format(PyExc_TypeError,
9925 "separator: expected str instance,"
9926 " %.80s found",
9927 Py_TYPE(separator)->tp_name);
9928 goto onError;
9929 }
9930 if (PyUnicode_READY(separator))
9931 goto onError;
9932 sep = separator;
9933 seplen = PyUnicode_GET_LENGTH(separator);
9934 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9935 /* inc refcount to keep this code path symmetric with the
9936 above case of a blank separator */
9937 Py_INCREF(sep);
9938 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009939 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009940 }
9941
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009942 /* There are at least two things to join, or else we have a subclass
9943 * of str in the sequence.
9944 * Do a pre-pass to figure out the total amount of space we'll
9945 * need (sz), and see whether all argument are strings.
9946 */
9947 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009948#ifdef Py_DEBUG
9949 use_memcpy = 0;
9950#else
9951 use_memcpy = 1;
9952#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009953 for (i = 0; i < seqlen; i++) {
9954 const Py_ssize_t old_sz = sz;
9955 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009956 if (!PyUnicode_Check(item)) {
9957 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009958 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009959 " %.80s found",
9960 i, Py_TYPE(item)->tp_name);
9961 goto onError;
9962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 if (PyUnicode_READY(item) == -1)
9964 goto onError;
9965 sz += PyUnicode_GET_LENGTH(item);
9966 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009967 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009968 if (i != 0)
9969 sz += seplen;
9970 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9971 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009972 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009973 goto onError;
9974 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009975 if (use_memcpy && last_obj != NULL) {
9976 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9977 use_memcpy = 0;
9978 }
9979 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009980 }
Tim Petersced69f82003-09-16 20:30:58 +00009981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009983 if (res == NULL)
9984 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009985
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009986 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009987#ifdef Py_DEBUG
9988 use_memcpy = 0;
9989#else
9990 if (use_memcpy) {
9991 res_data = PyUnicode_1BYTE_DATA(res);
9992 kind = PyUnicode_KIND(res);
9993 if (seplen != 0)
9994 sep_data = PyUnicode_1BYTE_DATA(sep);
9995 }
9996#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009997 if (use_memcpy) {
9998 for (i = 0; i < seqlen; ++i) {
9999 Py_ssize_t itemlen;
10000 item = items[i];
10001
10002 /* Copy item, and maybe the separator. */
10003 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010004 Py_MEMCPY(res_data,
10005 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010006 kind * seplen);
10007 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010008 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010009
10010 itemlen = PyUnicode_GET_LENGTH(item);
10011 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010012 Py_MEMCPY(res_data,
10013 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010014 kind * itemlen);
10015 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010016 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010017 }
10018 assert(res_data == PyUnicode_1BYTE_DATA(res)
10019 + kind * PyUnicode_GET_LENGTH(res));
10020 }
10021 else {
10022 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10023 Py_ssize_t itemlen;
10024 item = items[i];
10025
10026 /* Copy item, and maybe the separator. */
10027 if (i && seplen != 0) {
10028 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10029 res_offset += seplen;
10030 }
10031
10032 itemlen = PyUnicode_GET_LENGTH(item);
10033 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010034 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010035 res_offset += itemlen;
10036 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010037 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010038 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010039 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010040
Tim Peters05eba1f2004-08-27 21:32:02 +000010041 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010043 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045
Benjamin Peterson29060642009-01-31 22:14:21 +000010046 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010047 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010049 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050 return NULL;
10051}
10052
/* FILL(kind, data, value, start, length)

   Store VALUE into LENGTH consecutive code units of the buffer DATA,
   beginning at unit index START.  KIND selects the unit width
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND);
   PyUnicode_WCHAR_KIND is rejected by an assertion.  For the 1-byte kind
   VALUE is truncated to unsigned char and written with memset().

   Every argument is parenthesized in the expansion so that expression
   arguments keep their meaning.  KIND is expanded twice (assertion and
   switch), so it should be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10076
Victor Stinnerd3f08822012-05-29 12:57:52 +020010077void
10078_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10079 Py_UCS4 fill_char)
10080{
10081 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10082 const void *data = PyUnicode_DATA(unicode);
10083 assert(PyUnicode_IS_READY(unicode));
10084 assert(unicode_modifiable(unicode));
10085 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10086 assert(start >= 0);
10087 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10088 FILL(kind, data, fill_char, start, length);
10089}
10090
Victor Stinner3fe55312012-01-04 00:33:50 +010010091Py_ssize_t
10092PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10093 Py_UCS4 fill_char)
10094{
10095 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010096
10097 if (!PyUnicode_Check(unicode)) {
10098 PyErr_BadInternalCall();
10099 return -1;
10100 }
10101 if (PyUnicode_READY(unicode) == -1)
10102 return -1;
10103 if (unicode_check_modifiable(unicode))
10104 return -1;
10105
Victor Stinnerd3f08822012-05-29 12:57:52 +020010106 if (start < 0) {
10107 PyErr_SetString(PyExc_IndexError, "string index out of range");
10108 return -1;
10109 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010110 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10111 PyErr_SetString(PyExc_ValueError,
10112 "fill character is bigger than "
10113 "the string maximum character");
10114 return -1;
10115 }
10116
10117 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10118 length = Py_MIN(maxlen, length);
10119 if (length <= 0)
10120 return 0;
10121
Victor Stinnerd3f08822012-05-29 12:57:52 +020010122 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010123 return length;
10124}
10125
Victor Stinner9310abb2011-10-05 00:59:23 +020010126static PyObject *
10127pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010128 Py_ssize_t left,
10129 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 PyObject *u;
10133 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010134 int kind;
10135 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136
10137 if (left < 0)
10138 left = 0;
10139 if (right < 0)
10140 right = 0;
10141
Victor Stinnerc4b49542011-12-11 22:44:26 +010010142 if (left == 0 && right == 0)
10143 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10146 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010147 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10148 return NULL;
10149 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010151 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010153 if (!u)
10154 return NULL;
10155
10156 kind = PyUnicode_KIND(u);
10157 data = PyUnicode_DATA(u);
10158 if (left)
10159 FILL(kind, data, fill, 0, left);
10160 if (right)
10161 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010162 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010163 assert(_PyUnicode_CheckConsistency(u, 1));
10164 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165}
10166
Alexander Belopolsky40018472011-02-26 01:02:56 +000010167PyObject *
10168PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171
10172 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010173 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010174 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010175 if (PyUnicode_READY(string) == -1) {
10176 Py_DECREF(string);
10177 return NULL;
10178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179
Benjamin Petersonead6b532011-12-20 17:23:42 -060010180 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010182 if (PyUnicode_IS_ASCII(string))
10183 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010184 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010185 PyUnicode_GET_LENGTH(string), keepends);
10186 else
10187 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010188 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010189 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 break;
10191 case PyUnicode_2BYTE_KIND:
10192 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010193 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 PyUnicode_GET_LENGTH(string), keepends);
10195 break;
10196 case PyUnicode_4BYTE_KIND:
10197 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010198 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 PyUnicode_GET_LENGTH(string), keepends);
10200 break;
10201 default:
10202 assert(0);
10203 list = 0;
10204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205 Py_DECREF(string);
10206 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207}
10208
Alexander Belopolsky40018472011-02-26 01:02:56 +000010209static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010210split(PyObject *self,
10211 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010212 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010214 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 void *buf1, *buf2;
10216 Py_ssize_t len1, len2;
10217 PyObject* out;
10218
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010220 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 if (PyUnicode_READY(self) == -1)
10223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010226 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010228 if (PyUnicode_IS_ASCII(self))
10229 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010230 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010231 PyUnicode_GET_LENGTH(self), maxcount
10232 );
10233 else
10234 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010235 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010236 PyUnicode_GET_LENGTH(self), maxcount
10237 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 case PyUnicode_2BYTE_KIND:
10239 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010240 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 PyUnicode_GET_LENGTH(self), maxcount
10242 );
10243 case PyUnicode_4BYTE_KIND:
10244 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010245 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 PyUnicode_GET_LENGTH(self), maxcount
10247 );
10248 default:
10249 assert(0);
10250 return NULL;
10251 }
10252
10253 if (PyUnicode_READY(substring) == -1)
10254 return NULL;
10255
10256 kind1 = PyUnicode_KIND(self);
10257 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 len1 = PyUnicode_GET_LENGTH(self);
10259 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010260 if (kind1 < kind2 || len1 < len2) {
10261 out = PyList_New(1);
10262 if (out == NULL)
10263 return NULL;
10264 Py_INCREF(self);
10265 PyList_SET_ITEM(out, 0, self);
10266 return out;
10267 }
10268 buf1 = PyUnicode_DATA(self);
10269 buf2 = PyUnicode_DATA(substring);
10270 if (kind2 != kind1) {
10271 buf2 = _PyUnicode_AsKind(substring, kind1);
10272 if (!buf2)
10273 return NULL;
10274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010276 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010278 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10279 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010280 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010281 else
10282 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010283 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 break;
10285 case PyUnicode_2BYTE_KIND:
10286 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010287 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 break;
10289 case PyUnicode_4BYTE_KIND:
10290 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010291 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 break;
10293 default:
10294 out = NULL;
10295 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010296 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 PyMem_Free(buf2);
10298 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299}
10300
Alexander Belopolsky40018472011-02-26 01:02:56 +000010301static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010302rsplit(PyObject *self,
10303 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010304 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010305{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010306 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 void *buf1, *buf2;
10308 Py_ssize_t len1, len2;
10309 PyObject* out;
10310
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010311 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010312 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 if (PyUnicode_READY(self) == -1)
10315 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010318 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010320 if (PyUnicode_IS_ASCII(self))
10321 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010322 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010323 PyUnicode_GET_LENGTH(self), maxcount
10324 );
10325 else
10326 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010327 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010328 PyUnicode_GET_LENGTH(self), maxcount
10329 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 case PyUnicode_2BYTE_KIND:
10331 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010332 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 PyUnicode_GET_LENGTH(self), maxcount
10334 );
10335 case PyUnicode_4BYTE_KIND:
10336 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010337 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 PyUnicode_GET_LENGTH(self), maxcount
10339 );
10340 default:
10341 assert(0);
10342 return NULL;
10343 }
10344
10345 if (PyUnicode_READY(substring) == -1)
10346 return NULL;
10347
10348 kind1 = PyUnicode_KIND(self);
10349 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 len1 = PyUnicode_GET_LENGTH(self);
10351 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010352 if (kind1 < kind2 || len1 < len2) {
10353 out = PyList_New(1);
10354 if (out == NULL)
10355 return NULL;
10356 Py_INCREF(self);
10357 PyList_SET_ITEM(out, 0, self);
10358 return out;
10359 }
10360 buf1 = PyUnicode_DATA(self);
10361 buf2 = PyUnicode_DATA(substring);
10362 if (kind2 != kind1) {
10363 buf2 = _PyUnicode_AsKind(substring, kind1);
10364 if (!buf2)
10365 return NULL;
10366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010368 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010370 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10371 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010372 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010373 else
10374 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010375 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 break;
10377 case PyUnicode_2BYTE_KIND:
10378 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010379 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 break;
10381 case PyUnicode_4BYTE_KIND:
10382 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010383 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 break;
10385 default:
10386 out = NULL;
10387 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010388 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 PyMem_Free(buf2);
10390 return out;
10391}
10392
10393static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010394anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10395 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010397 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010399 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10400 return asciilib_find(buf1, len1, buf2, len2, offset);
10401 else
10402 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 case PyUnicode_2BYTE_KIND:
10404 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10405 case PyUnicode_4BYTE_KIND:
10406 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10407 }
10408 assert(0);
10409 return -1;
10410}
10411
10412static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010413anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10414 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010416 switch (kind) {
10417 case PyUnicode_1BYTE_KIND:
10418 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10419 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10420 else
10421 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10422 case PyUnicode_2BYTE_KIND:
10423 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10424 case PyUnicode_4BYTE_KIND:
10425 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10426 }
10427 assert(0);
10428 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010429}
10430
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010431static void
10432replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10433 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10434{
10435 int kind = PyUnicode_KIND(u);
10436 void *data = PyUnicode_DATA(u);
10437 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10438 if (kind == PyUnicode_1BYTE_KIND) {
10439 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10440 (Py_UCS1 *)data + len,
10441 u1, u2, maxcount);
10442 }
10443 else if (kind == PyUnicode_2BYTE_KIND) {
10444 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10445 (Py_UCS2 *)data + len,
10446 u1, u2, maxcount);
10447 }
10448 else {
10449 assert(kind == PyUnicode_4BYTE_KIND);
10450 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10451 (Py_UCS4 *)data + len,
10452 u1, u2, maxcount);
10453 }
10454}
10455
Alexander Belopolsky40018472011-02-26 01:02:56 +000010456static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457replace(PyObject *self, PyObject *str1,
10458 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010459{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 PyObject *u;
10461 char *sbuf = PyUnicode_DATA(self);
10462 char *buf1 = PyUnicode_DATA(str1);
10463 char *buf2 = PyUnicode_DATA(str2);
10464 int srelease = 0, release1 = 0, release2 = 0;
10465 int skind = PyUnicode_KIND(self);
10466 int kind1 = PyUnicode_KIND(str1);
10467 int kind2 = PyUnicode_KIND(str2);
10468 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10469 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10470 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010471 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010472 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473
10474 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010475 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010477 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010478
Victor Stinner59de0ee2011-10-07 10:01:28 +020010479 if (str1 == str2)
10480 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481
Victor Stinner49a0a212011-10-12 23:46:10 +020010482 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010483 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10484 if (maxchar < maxchar_str1)
10485 /* substring too wide to be present */
10486 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010487 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10488 /* Replacing str1 with str2 may cause a maxchar reduction in the
10489 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010490 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010491 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010494 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010496 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010498 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010499 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010500 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010501
Victor Stinner69ed0f42013-04-09 21:48:24 +020010502 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010503 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010504 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010505 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010506 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010508 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010510
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010511 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10512 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010513 }
10514 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 int rkind = skind;
10516 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010517 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 if (kind1 < rkind) {
10520 /* widen substring */
10521 buf1 = _PyUnicode_AsKind(str1, rkind);
10522 if (!buf1) goto error;
10523 release1 = 1;
10524 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010525 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010526 if (i < 0)
10527 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 if (rkind > kind2) {
10529 /* widen replacement */
10530 buf2 = _PyUnicode_AsKind(str2, rkind);
10531 if (!buf2) goto error;
10532 release2 = 1;
10533 }
10534 else if (rkind < kind2) {
10535 /* widen self and buf1 */
10536 rkind = kind2;
10537 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010538 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 sbuf = _PyUnicode_AsKind(self, rkind);
10540 if (!sbuf) goto error;
10541 srelease = 1;
10542 buf1 = _PyUnicode_AsKind(str1, rkind);
10543 if (!buf1) goto error;
10544 release1 = 1;
10545 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010546 u = PyUnicode_New(slen, maxchar);
10547 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010549 assert(PyUnicode_KIND(u) == rkind);
10550 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010551
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010552 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010553 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010554 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010556 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010558
10559 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010560 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010561 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010562 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010563 if (i == -1)
10564 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010565 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010567 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010571 }
10572 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010574 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 int rkind = skind;
10576 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010579 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 buf1 = _PyUnicode_AsKind(str1, rkind);
10581 if (!buf1) goto error;
10582 release1 = 1;
10583 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010584 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010585 if (n == 0)
10586 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010588 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 buf2 = _PyUnicode_AsKind(str2, rkind);
10590 if (!buf2) goto error;
10591 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010594 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 rkind = kind2;
10596 sbuf = _PyUnicode_AsKind(self, rkind);
10597 if (!sbuf) goto error;
10598 srelease = 1;
10599 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010600 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 buf1 = _PyUnicode_AsKind(str1, rkind);
10602 if (!buf1) goto error;
10603 release1 = 1;
10604 }
10605 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10606 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010607 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 PyErr_SetString(PyExc_OverflowError,
10609 "replace string is too long");
10610 goto error;
10611 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010612 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010613 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010614 _Py_INCREF_UNICODE_EMPTY();
10615 if (!unicode_empty)
10616 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010617 u = unicode_empty;
10618 goto done;
10619 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010620 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 PyErr_SetString(PyExc_OverflowError,
10622 "replace string is too long");
10623 goto error;
10624 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010625 u = PyUnicode_New(new_size, maxchar);
10626 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010628 assert(PyUnicode_KIND(u) == rkind);
10629 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 ires = i = 0;
10631 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010632 while (n-- > 0) {
10633 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010634 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010635 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010636 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010637 if (j == -1)
10638 break;
10639 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010640 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010641 memcpy(res + rkind * ires,
10642 sbuf + rkind * i,
10643 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010645 }
10646 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010648 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010650 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010657 memcpy(res + rkind * ires,
10658 sbuf + rkind * i,
10659 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010660 }
10661 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010662 /* interleave */
10663 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010664 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010668 if (--n <= 0)
10669 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010670 memcpy(res + rkind * ires,
10671 sbuf + rkind * i,
10672 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 ires++;
10674 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010675 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010676 memcpy(res + rkind * ires,
10677 sbuf + rkind * i,
10678 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010679 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010680 }
10681
10682 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010683 unicode_adjust_maxchar(&u);
10684 if (u == NULL)
10685 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010687
10688 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 if (srelease)
10690 PyMem_FREE(sbuf);
10691 if (release1)
10692 PyMem_FREE(buf1);
10693 if (release2)
10694 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010695 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697
Benjamin Peterson29060642009-01-31 22:14:21 +000010698 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010699 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 if (srelease)
10701 PyMem_FREE(sbuf);
10702 if (release1)
10703 PyMem_FREE(buf1);
10704 if (release2)
10705 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010706 return unicode_result_unchanged(self);
10707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 error:
10709 if (srelease && sbuf)
10710 PyMem_FREE(sbuf);
10711 if (release1 && buf1)
10712 PyMem_FREE(buf1);
10713 if (release2 && buf2)
10714 PyMem_FREE(buf2);
10715 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716}
10717
10718/* --- Unicode Object Methods --------------------------------------------- */
10719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010720PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010721 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722\n\
10723Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010724characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725
10726static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010727unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010729 if (PyUnicode_READY(self) == -1)
10730 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010731 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732}
10733
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010734PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010735 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736\n\
10737Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010738have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739
10740static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010741unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010743 if (PyUnicode_READY(self) == -1)
10744 return NULL;
10745 if (PyUnicode_GET_LENGTH(self) == 0)
10746 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010747 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748}
10749
Benjamin Petersond5890c82012-01-14 13:23:30 -050010750PyDoc_STRVAR(casefold__doc__,
10751 "S.casefold() -> str\n\
10752\n\
10753Return a version of S suitable for caseless comparisons.");
10754
10755static PyObject *
10756unicode_casefold(PyObject *self)
10757{
10758 if (PyUnicode_READY(self) == -1)
10759 return NULL;
10760 if (PyUnicode_IS_ASCII(self))
10761 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010762 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010763}
10764
10765
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010766/* Argument converter. Coerces to a single unicode character */
10767
10768static int
10769convert_uc(PyObject *obj, void *addr)
10770{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010772 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010773
Benjamin Peterson14339b62009-01-31 16:36:08 +000010774 uniobj = PyUnicode_FromObject(obj);
10775 if (uniobj == NULL) {
10776 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010777 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010778 return 0;
10779 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010781 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010782 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010783 Py_DECREF(uniobj);
10784 return 0;
10785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010787 Py_DECREF(uniobj);
10788 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010789}
10790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010791PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010792 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010794Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010795done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010796
10797static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010798unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010799{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010800 Py_ssize_t marg, left;
10801 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 Py_UCS4 fillchar = ' ';
10803
Victor Stinnere9a29352011-10-01 02:14:59 +020010804 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806
Benjamin Petersonbac79492012-01-14 13:34:47 -050010807 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808 return NULL;
10809
Victor Stinnerc4b49542011-12-11 22:44:26 +010010810 if (PyUnicode_GET_LENGTH(self) >= width)
10811 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812
Victor Stinnerc4b49542011-12-11 22:44:26 +010010813 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814 left = marg / 2 + (marg & width & 1);
10815
Victor Stinner9310abb2011-10-05 00:59:23 +020010816 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817}
10818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819/* This function assumes that str1 and str2 are readied by the caller. */
10820
Marc-André Lemburge5034372000-08-08 08:04:29 +000010821static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010822unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010823{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010824#define COMPARE(TYPE1, TYPE2) \
10825 do { \
10826 TYPE1* p1 = (TYPE1 *)data1; \
10827 TYPE2* p2 = (TYPE2 *)data2; \
10828 TYPE1* end = p1 + len; \
10829 Py_UCS4 c1, c2; \
10830 for (; p1 != end; p1++, p2++) { \
10831 c1 = *p1; \
10832 c2 = *p2; \
10833 if (c1 != c2) \
10834 return (c1 < c2) ? -1 : 1; \
10835 } \
10836 } \
10837 while (0)
10838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 int kind1, kind2;
10840 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010841 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 kind1 = PyUnicode_KIND(str1);
10844 kind2 = PyUnicode_KIND(str2);
10845 data1 = PyUnicode_DATA(str1);
10846 data2 = PyUnicode_DATA(str2);
10847 len1 = PyUnicode_GET_LENGTH(str1);
10848 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010849 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010850
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010851 switch(kind1) {
10852 case PyUnicode_1BYTE_KIND:
10853 {
10854 switch(kind2) {
10855 case PyUnicode_1BYTE_KIND:
10856 {
10857 int cmp = memcmp(data1, data2, len);
10858 /* normalize result of memcmp() into the range [-1; 1] */
10859 if (cmp < 0)
10860 return -1;
10861 if (cmp > 0)
10862 return 1;
10863 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010864 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010865 case PyUnicode_2BYTE_KIND:
10866 COMPARE(Py_UCS1, Py_UCS2);
10867 break;
10868 case PyUnicode_4BYTE_KIND:
10869 COMPARE(Py_UCS1, Py_UCS4);
10870 break;
10871 default:
10872 assert(0);
10873 }
10874 break;
10875 }
10876 case PyUnicode_2BYTE_KIND:
10877 {
10878 switch(kind2) {
10879 case PyUnicode_1BYTE_KIND:
10880 COMPARE(Py_UCS2, Py_UCS1);
10881 break;
10882 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010883 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010884 COMPARE(Py_UCS2, Py_UCS2);
10885 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010886 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010887 case PyUnicode_4BYTE_KIND:
10888 COMPARE(Py_UCS2, Py_UCS4);
10889 break;
10890 default:
10891 assert(0);
10892 }
10893 break;
10894 }
10895 case PyUnicode_4BYTE_KIND:
10896 {
10897 switch(kind2) {
10898 case PyUnicode_1BYTE_KIND:
10899 COMPARE(Py_UCS4, Py_UCS1);
10900 break;
10901 case PyUnicode_2BYTE_KIND:
10902 COMPARE(Py_UCS4, Py_UCS2);
10903 break;
10904 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010905 {
10906#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10907 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10908 /* normalize result of wmemcmp() into the range [-1; 1] */
10909 if (cmp < 0)
10910 return -1;
10911 if (cmp > 0)
10912 return 1;
10913#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010914 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010915#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010916 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010917 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010918 default:
10919 assert(0);
10920 }
10921 break;
10922 }
10923 default:
10924 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010925 }
10926
Victor Stinner770e19e2012-10-04 22:59:45 +020010927 if (len1 == len2)
10928 return 0;
10929 if (len1 < len2)
10930 return -1;
10931 else
10932 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010933
10934#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010935}
10936
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010937Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010938unicode_compare_eq(PyObject *str1, PyObject *str2)
10939{
10940 int kind;
10941 void *data1, *data2;
10942 Py_ssize_t len;
10943 int cmp;
10944
Victor Stinnere5567ad2012-10-23 02:48:49 +020010945 len = PyUnicode_GET_LENGTH(str1);
10946 if (PyUnicode_GET_LENGTH(str2) != len)
10947 return 0;
10948 kind = PyUnicode_KIND(str1);
10949 if (PyUnicode_KIND(str2) != kind)
10950 return 0;
10951 data1 = PyUnicode_DATA(str1);
10952 data2 = PyUnicode_DATA(str2);
10953
10954 cmp = memcmp(data1, data2, len * kind);
10955 return (cmp == 0);
10956}
10957
10958
Alexander Belopolsky40018472011-02-26 01:02:56 +000010959int
10960PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10963 if (PyUnicode_READY(left) == -1 ||
10964 PyUnicode_READY(right) == -1)
10965 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010966
10967 /* a string is equal to itself */
10968 if (left == right)
10969 return 0;
10970
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010971 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010973 PyErr_Format(PyExc_TypeError,
10974 "Can't compare %.100s and %.100s",
10975 left->ob_type->tp_name,
10976 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977 return -1;
10978}
10979
Martin v. Löwis5b222132007-06-10 09:51:05 +000010980int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010981_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10982{
10983 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10984 if (right_str == NULL)
10985 return -1;
10986 return PyUnicode_Compare(left, right_str);
10987}
10988
10989int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010990PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 Py_ssize_t i;
10993 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 Py_UCS4 chr;
10995
Victor Stinner910337b2011-10-03 03:20:16 +020010996 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 if (PyUnicode_READY(uni) == -1)
10998 return -1;
10999 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011000 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011001 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011002 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011003 size_t len, len2 = strlen(str);
11004 int cmp;
11005
11006 len = Py_MIN(len1, len2);
11007 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011008 if (cmp != 0) {
11009 if (cmp < 0)
11010 return -1;
11011 else
11012 return 1;
11013 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011014 if (len1 > len2)
11015 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011016 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011017 return -1; /* str is longer */
11018 return 0;
11019 }
11020 else {
11021 void *data = PyUnicode_DATA(uni);
11022 /* Compare Unicode string and source character set string */
11023 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011024 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011025 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11026 /* This check keeps Python strings that end in '\0' from comparing equal
11027 to C strings identical up to that point. */
11028 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11029 return 1; /* uni is longer */
11030 if (str[i])
11031 return -1; /* str is longer */
11032 return 0;
11033 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011034}
11035
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011036
/* Select Py_True or Py_False from a C truth value.  The macro only picks
   the pointer; it does not touch the reference count. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011039
Alexander Belopolsky40018472011-02-26 01:02:56 +000011040PyObject *
11041PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011042{
11043 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011044 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011045
Victor Stinnere5567ad2012-10-23 02:48:49 +020011046 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11047 Py_RETURN_NOTIMPLEMENTED;
11048
11049 if (PyUnicode_READY(left) == -1 ||
11050 PyUnicode_READY(right) == -1)
11051 return NULL;
11052
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011053 if (left == right) {
11054 switch (op) {
11055 case Py_EQ:
11056 case Py_LE:
11057 case Py_GE:
11058 /* a string is equal to itself */
11059 v = Py_True;
11060 break;
11061 case Py_NE:
11062 case Py_LT:
11063 case Py_GT:
11064 v = Py_False;
11065 break;
11066 default:
11067 PyErr_BadArgument();
11068 return NULL;
11069 }
11070 }
11071 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011072 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011073 result ^= (op == Py_NE);
11074 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011075 }
11076 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011077 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011078
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011079 /* Convert the return value to a Boolean */
11080 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011081 case Py_LE:
11082 v = TEST_COND(result <= 0);
11083 break;
11084 case Py_GE:
11085 v = TEST_COND(result >= 0);
11086 break;
11087 case Py_LT:
11088 v = TEST_COND(result == -1);
11089 break;
11090 case Py_GT:
11091 v = TEST_COND(result == 1);
11092 break;
11093 default:
11094 PyErr_BadArgument();
11095 return NULL;
11096 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011097 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011098 Py_INCREF(v);
11099 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011100}
11101
Alexander Belopolsky40018472011-02-26 01:02:56 +000011102int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011103_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11104{
11105 return unicode_eq(aa, bb);
11106}
11107
11108int
Alexander Belopolsky40018472011-02-26 01:02:56 +000011109PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011110{
Thomas Wouters477c8d52006-05-27 19:21:47 +000011111 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020011112 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011113 void *buf1, *buf2;
11114 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011115 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011116
11117 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000011118 sub = PyUnicode_FromObject(element);
11119 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011120 PyErr_Format(PyExc_TypeError,
11121 "'in <string>' requires string as left operand, not %s",
11122 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011123 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011124 }
11125
Thomas Wouters477c8d52006-05-27 19:21:47 +000011126 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011127 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011128 Py_DECREF(sub);
11129 return -1;
11130 }
11131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 kind1 = PyUnicode_KIND(str);
11133 kind2 = PyUnicode_KIND(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011134 if (kind1 < kind2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011135 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050011136 Py_DECREF(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011137 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 }
11139 len1 = PyUnicode_GET_LENGTH(str);
11140 len2 = PyUnicode_GET_LENGTH(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011141 if (len1 < len2) {
11142 Py_DECREF(sub);
11143 Py_DECREF(str);
11144 return 0;
11145 }
11146 buf1 = PyUnicode_DATA(str);
11147 buf2 = PyUnicode_DATA(sub);
11148 if (len2 == 1) {
11149 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11150 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11151 Py_DECREF(sub);
11152 Py_DECREF(str);
11153 return result;
11154 }
11155 if (kind2 != kind1) {
11156 buf2 = _PyUnicode_AsKind(sub, kind1);
11157 if (!buf2) {
11158 Py_DECREF(sub);
11159 Py_DECREF(str);
11160 return -1;
11161 }
11162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163
Victor Stinner77282cb2013-04-14 19:22:47 +020011164 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 case PyUnicode_1BYTE_KIND:
11166 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11167 break;
11168 case PyUnicode_2BYTE_KIND:
11169 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11170 break;
11171 case PyUnicode_4BYTE_KIND:
11172 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11173 break;
11174 default:
11175 result = -1;
11176 assert(0);
11177 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011178
11179 Py_DECREF(str);
11180 Py_DECREF(sub);
11181
Victor Stinner77282cb2013-04-14 19:22:47 +020011182 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 PyMem_Free(buf2);
11184
Guido van Rossum403d68b2000-03-13 15:55:09 +000011185 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011186}
11187
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188/* Concat to string or Unicode object giving a new Unicode object. */
11189
Alexander Belopolsky40018472011-02-26 01:02:56 +000011190PyObject *
11191PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011194 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011195 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196
11197 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011200 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011203 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204
11205 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011206 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011207 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011210 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011211 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213 }
11214
Victor Stinner488fa492011-12-12 00:01:39 +010011215 u_len = PyUnicode_GET_LENGTH(u);
11216 v_len = PyUnicode_GET_LENGTH(v);
11217 if (u_len > PY_SSIZE_T_MAX - v_len) {
11218 PyErr_SetString(PyExc_OverflowError,
11219 "strings are too large to concat");
11220 goto onError;
11221 }
11222 new_len = u_len + v_len;
11223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011225 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011226 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011229 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011231 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011232 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11233 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234 Py_DECREF(u);
11235 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011236 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238
Benjamin Peterson29060642009-01-31 22:14:21 +000011239 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 Py_XDECREF(u);
11241 Py_XDECREF(v);
11242 return NULL;
11243}
11244
Walter Dörwald1ab83302007-05-18 17:15:44 +000011245void
Victor Stinner23e56682011-10-03 03:54:37 +020011246PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011247{
Victor Stinner23e56682011-10-03 03:54:37 +020011248 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011249 Py_UCS4 maxchar, maxchar2;
11250 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011251
11252 if (p_left == NULL) {
11253 if (!PyErr_Occurred())
11254 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011255 return;
11256 }
Victor Stinner23e56682011-10-03 03:54:37 +020011257 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011258 if (right == NULL || left == NULL
11259 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011260 if (!PyErr_Occurred())
11261 PyErr_BadInternalCall();
11262 goto error;
11263 }
11264
Benjamin Petersonbac79492012-01-14 13:34:47 -050011265 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011266 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011267 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011268 goto error;
11269
Victor Stinner488fa492011-12-12 00:01:39 +010011270 /* Shortcuts */
11271 if (left == unicode_empty) {
11272 Py_DECREF(left);
11273 Py_INCREF(right);
11274 *p_left = right;
11275 return;
11276 }
11277 if (right == unicode_empty)
11278 return;
11279
11280 left_len = PyUnicode_GET_LENGTH(left);
11281 right_len = PyUnicode_GET_LENGTH(right);
11282 if (left_len > PY_SSIZE_T_MAX - right_len) {
11283 PyErr_SetString(PyExc_OverflowError,
11284 "strings are too large to concat");
11285 goto error;
11286 }
11287 new_len = left_len + right_len;
11288
11289 if (unicode_modifiable(left)
11290 && PyUnicode_CheckExact(right)
11291 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011292 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11293 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011294 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011295 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011296 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11297 {
11298 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011299 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011300 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011301
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011302 /* copy 'right' into the newly allocated area of 'left' */
11303 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011304 }
Victor Stinner488fa492011-12-12 00:01:39 +010011305 else {
11306 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11307 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011308 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011309
Victor Stinner488fa492011-12-12 00:01:39 +010011310 /* Concat the two Unicode strings */
11311 res = PyUnicode_New(new_len, maxchar);
11312 if (res == NULL)
11313 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011314 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11315 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011316 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011317 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011318 }
11319 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011320 return;
11321
11322error:
Victor Stinner488fa492011-12-12 00:01:39 +010011323 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011324}
11325
11326void
11327PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11328{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011329 PyUnicode_Append(pleft, right);
11330 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011331}
11332
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011333PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011334 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011336Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011337string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011338interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339
11340static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011341unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011343 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011344 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011345 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011347 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 void *buf1, *buf2;
11349 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350
Jesus Ceaac451502011-04-20 17:09:23 +020011351 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11352 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011353 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 kind1 = PyUnicode_KIND(self);
11356 kind2 = PyUnicode_KIND(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011357 if (kind1 < kind2) {
Christian Heimesd47802e2013-06-29 21:33:36 +020011358 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011359 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 len1 = PyUnicode_GET_LENGTH(self);
11362 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011364 if (end - start < len2) {
11365 Py_DECREF(substring);
11366 return PyLong_FromLong(0);
11367 }
11368 buf1 = PyUnicode_DATA(self);
11369 buf2 = PyUnicode_DATA(substring);
11370 if (kind2 != kind1) {
11371 buf2 = _PyUnicode_AsKind(substring, kind1);
11372 if (!buf2) {
11373 Py_DECREF(substring);
11374 return NULL;
11375 }
11376 }
11377 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 case PyUnicode_1BYTE_KIND:
11379 iresult = ucs1lib_count(
11380 ((Py_UCS1*)buf1) + start, end - start,
11381 buf2, len2, PY_SSIZE_T_MAX
11382 );
11383 break;
11384 case PyUnicode_2BYTE_KIND:
11385 iresult = ucs2lib_count(
11386 ((Py_UCS2*)buf1) + start, end - start,
11387 buf2, len2, PY_SSIZE_T_MAX
11388 );
11389 break;
11390 case PyUnicode_4BYTE_KIND:
11391 iresult = ucs4lib_count(
11392 ((Py_UCS4*)buf1) + start, end - start,
11393 buf2, len2, PY_SSIZE_T_MAX
11394 );
11395 break;
11396 default:
11397 assert(0); iresult = 0;
11398 }
11399
11400 result = PyLong_FromSsize_t(iresult);
11401
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011402 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404
11405 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011406
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407 return result;
11408}
11409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011411 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011413Encode S using the codec registered for encoding. Default encoding\n\
11414is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011415handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011416a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11417'xmlcharrefreplace' as well as any other name registered with\n\
11418codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
11420static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011421unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011423 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424 char *encoding = NULL;
11425 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011426
Benjamin Peterson308d6372009-09-18 21:42:35 +000011427 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11428 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011430 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011431}
11432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011433PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011434 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435\n\
11436Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011437If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
11439static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011440unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011442 Py_ssize_t i, j, line_pos, src_len, incr;
11443 Py_UCS4 ch;
11444 PyObject *u;
11445 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011446 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011448 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011449 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450
Ezio Melotti745d54d2013-11-16 19:10:57 +020011451 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11452 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011453 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454
Antoine Pitrou22425222011-10-04 19:10:51 +020011455 if (PyUnicode_READY(self) == -1)
11456 return NULL;
11457
Thomas Wouters7e474022000-07-16 12:04:32 +000011458 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011459 src_len = PyUnicode_GET_LENGTH(self);
11460 i = j = line_pos = 0;
11461 kind = PyUnicode_KIND(self);
11462 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011463 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011464 for (; i < src_len; i++) {
11465 ch = PyUnicode_READ(kind, src_data, i);
11466 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011467 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011468 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011469 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011471 goto overflow;
11472 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011474 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011477 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011478 goto overflow;
11479 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011481 if (ch == '\n' || ch == '\r')
11482 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011484 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011485 if (!found)
11486 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011487
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011489 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490 if (!u)
11491 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011492 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493
Antoine Pitroue71d5742011-10-04 15:55:09 +020011494 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
Antoine Pitroue71d5742011-10-04 15:55:09 +020011496 for (; i < src_len; i++) {
11497 ch = PyUnicode_READ(kind, src_data, i);
11498 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011500 incr = tabsize - (line_pos % tabsize);
11501 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011502 FILL(kind, dest_data, ' ', j, incr);
11503 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011504 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011505 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011507 line_pos++;
11508 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011509 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011510 if (ch == '\n' || ch == '\r')
11511 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011513 }
11514 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011515 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011516
Antoine Pitroue71d5742011-10-04 15:55:09 +020011517 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011518 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520}
11521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011522PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524\n\
11525Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011526such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527arguments start and end are interpreted as in slice notation.\n\
11528\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011529Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530
11531static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011534 /* initialize variables to prevent gcc warning */
11535 PyObject *substring = NULL;
11536 Py_ssize_t start = 0;
11537 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011538 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539
Jesus Ceaac451502011-04-20 17:09:23 +020011540 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11541 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543
Christian Heimesd47802e2013-06-29 21:33:36 +020011544 if (PyUnicode_READY(self) == -1) {
11545 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011547 }
11548 if (PyUnicode_READY(substring) == -1) {
11549 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011550 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011551 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552
Victor Stinner7931d9a2011-11-04 00:22:48 +010011553 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554
11555 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 if (result == -2)
11558 return NULL;
11559
Christian Heimes217cfd12007-12-02 14:31:20 +000011560 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561}
11562
11563static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011564unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011566 void *data;
11567 enum PyUnicode_Kind kind;
11568 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011569
11570 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11571 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011573 }
11574 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11575 PyErr_SetString(PyExc_IndexError, "string index out of range");
11576 return NULL;
11577 }
11578 kind = PyUnicode_KIND(self);
11579 data = PyUnicode_DATA(self);
11580 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011581 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582}
11583
Guido van Rossumc2504932007-09-18 19:42:40 +000011584/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011585 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011586static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011587unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588{
Guido van Rossumc2504932007-09-18 19:42:40 +000011589 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011590 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011591
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011592#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011593 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011594#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 if (_PyUnicode_HASH(self) != -1)
11596 return _PyUnicode_HASH(self);
11597 if (PyUnicode_READY(self) == -1)
11598 return -1;
11599 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011600 /*
11601 We make the hash of the empty string be 0, rather than using
11602 (prefix ^ suffix), since this slightly obfuscates the hash secret
11603 */
11604 if (len == 0) {
11605 _PyUnicode_HASH(self) = 0;
11606 return 0;
11607 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011608 x = _Py_HashBytes(PyUnicode_DATA(self),
11609 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011611 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612}
11613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011614PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011615 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011617Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
11619static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011622 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011623 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011624 PyObject *substring = NULL;
11625 Py_ssize_t start = 0;
11626 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627
Jesus Ceaac451502011-04-20 17:09:23 +020011628 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11629 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631
Christian Heimesd47a0452013-06-29 21:21:37 +020011632 if (PyUnicode_READY(self) == -1) {
11633 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011635 }
11636 if (PyUnicode_READY(substring) == -1) {
11637 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011639 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640
Victor Stinner7931d9a2011-11-04 00:22:48 +010011641 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642
11643 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 if (result == -2)
11646 return NULL;
11647
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648 if (result < 0) {
11649 PyErr_SetString(PyExc_ValueError, "substring not found");
11650 return NULL;
11651 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011652
Christian Heimes217cfd12007-12-02 14:31:20 +000011653 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654}
11655
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011656PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011657 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011659Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011660at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661
11662static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011663unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 Py_ssize_t i, length;
11666 int kind;
11667 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668 int cased;
11669
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 if (PyUnicode_READY(self) == -1)
11671 return NULL;
11672 length = PyUnicode_GET_LENGTH(self);
11673 kind = PyUnicode_KIND(self);
11674 data = PyUnicode_DATA(self);
11675
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 if (length == 1)
11678 return PyBool_FromLong(
11679 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011681 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011683 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011684
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 for (i = 0; i < length; i++) {
11687 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011688
Benjamin Peterson29060642009-01-31 22:14:21 +000011689 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11690 return PyBool_FromLong(0);
11691 else if (!cased && Py_UNICODE_ISLOWER(ch))
11692 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011694 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695}
11696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011697PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011698 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011700Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011701at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702
11703static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011704unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 Py_ssize_t i, length;
11707 int kind;
11708 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709 int cased;
11710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 if (PyUnicode_READY(self) == -1)
11712 return NULL;
11713 length = PyUnicode_GET_LENGTH(self);
11714 kind = PyUnicode_KIND(self);
11715 data = PyUnicode_DATA(self);
11716
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 if (length == 1)
11719 return PyBool_FromLong(
11720 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011722 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011725
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727 for (i = 0; i < length; i++) {
11728 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011729
Benjamin Peterson29060642009-01-31 22:14:21 +000011730 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11731 return PyBool_FromLong(0);
11732 else if (!cased && Py_UNICODE_ISUPPER(ch))
11733 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011735 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736}
11737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011738PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011741Return True if S is a titlecased string and there is at least one\n\
11742character in S, i.e. upper- and titlecase characters may only\n\
11743follow uncased characters and lowercase characters only cased ones.\n\
11744Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745
11746static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011747unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 Py_ssize_t i, length;
11750 int kind;
11751 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752 int cased, previous_is_cased;
11753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 if (PyUnicode_READY(self) == -1)
11755 return NULL;
11756 length = PyUnicode_GET_LENGTH(self);
11757 kind = PyUnicode_KIND(self);
11758 data = PyUnicode_DATA(self);
11759
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 if (length == 1) {
11762 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11763 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11764 (Py_UNICODE_ISUPPER(ch) != 0));
11765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011767 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011769 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011770
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771 cased = 0;
11772 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 for (i = 0; i < length; i++) {
11774 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011775
Benjamin Peterson29060642009-01-31 22:14:21 +000011776 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11777 if (previous_is_cased)
11778 return PyBool_FromLong(0);
11779 previous_is_cased = 1;
11780 cased = 1;
11781 }
11782 else if (Py_UNICODE_ISLOWER(ch)) {
11783 if (!previous_is_cased)
11784 return PyBool_FromLong(0);
11785 previous_is_cased = 1;
11786 cased = 1;
11787 }
11788 else
11789 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011791 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792}
11793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011794PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011795 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011797Return True if all characters in S are whitespace\n\
11798and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799
11800static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011801unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 Py_ssize_t i, length;
11804 int kind;
11805 void *data;
11806
11807 if (PyUnicode_READY(self) == -1)
11808 return NULL;
11809 length = PyUnicode_GET_LENGTH(self);
11810 kind = PyUnicode_KIND(self);
11811 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 if (length == 1)
11815 return PyBool_FromLong(
11816 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011818 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 for (i = 0; i < length; i++) {
11823 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011824 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011827 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828}
11829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011830PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011831 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011832\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011833Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011834and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011835
11836static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011837unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011838{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011839 Py_ssize_t i, length;
11840 int kind;
11841 void *data;
11842
11843 if (PyUnicode_READY(self) == -1)
11844 return NULL;
11845 length = PyUnicode_GET_LENGTH(self);
11846 kind = PyUnicode_KIND(self);
11847 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011848
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011849 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 if (length == 1)
11851 return PyBool_FromLong(
11852 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011853
11854 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011856 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 for (i = 0; i < length; i++) {
11859 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011860 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011861 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011862 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011863}
11864
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011865PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011866 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011867\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011868Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011869and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011870
11871static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011872unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011873{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 int kind;
11875 void *data;
11876 Py_ssize_t len, i;
11877
11878 if (PyUnicode_READY(self) == -1)
11879 return NULL;
11880
11881 kind = PyUnicode_KIND(self);
11882 data = PyUnicode_DATA(self);
11883 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011884
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011885 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 if (len == 1) {
11887 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11888 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11889 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011890
11891 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011893 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 for (i = 0; i < len; i++) {
11896 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011897 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011898 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011899 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011900 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011901}
11902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011903PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011906Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011907False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908
11909static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011910unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 Py_ssize_t i, length;
11913 int kind;
11914 void *data;
11915
11916 if (PyUnicode_READY(self) == -1)
11917 return NULL;
11918 length = PyUnicode_GET_LENGTH(self);
11919 kind = PyUnicode_KIND(self);
11920 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 if (length == 1)
11924 return PyBool_FromLong(
11925 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011927 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 for (i = 0; i < length; i++) {
11932 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011933 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011935 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936}
11937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011938PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011939 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011941Return True if all characters in S are digits\n\
11942and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943
11944static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011945unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 Py_ssize_t i, length;
11948 int kind;
11949 void *data;
11950
11951 if (PyUnicode_READY(self) == -1)
11952 return NULL;
11953 length = PyUnicode_GET_LENGTH(self);
11954 kind = PyUnicode_KIND(self);
11955 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 if (length == 1) {
11959 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11960 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011963 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 for (i = 0; i < length; i++) {
11968 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011971 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972}
11973
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011974PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011975 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011977Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011978False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979
11980static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011981unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 Py_ssize_t i, length;
11984 int kind;
11985 void *data;
11986
11987 if (PyUnicode_READY(self) == -1)
11988 return NULL;
11989 length = PyUnicode_GET_LENGTH(self);
11990 kind = PyUnicode_KIND(self);
11991 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 if (length == 1)
11995 return PyBool_FromLong(
11996 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011998 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 for (i = 0; i < length; i++) {
12003 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012006 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007}
12008
Martin v. Löwis47383402007-08-15 07:32:56 +000012009int
12010PyUnicode_IsIdentifier(PyObject *self)
12011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 int kind;
12013 void *data;
12014 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012015 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 if (PyUnicode_READY(self) == -1) {
12018 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012019 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 }
12021
12022 /* Special case for empty strings */
12023 if (PyUnicode_GET_LENGTH(self) == 0)
12024 return 0;
12025 kind = PyUnicode_KIND(self);
12026 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012027
12028 /* PEP 3131 says that the first character must be in
12029 XID_Start and subsequent characters in XID_Continue,
12030 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012031 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012032 letters, digits, underscore). However, given the current
12033 definition of XID_Start and XID_Continue, it is sufficient
12034 to check just for these, except that _ must be allowed
12035 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012037 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012038 return 0;
12039
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012040 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012042 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012043 return 1;
12044}
12045
12046PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012048\n\
12049Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012050to the language definition.\n\
12051\n\
12052Use keyword.iskeyword() to test for reserved identifiers\n\
12053such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012054
12055static PyObject*
12056unicode_isidentifier(PyObject *self)
12057{
12058 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12059}
12060
Georg Brandl559e5d72008-06-11 18:37:52 +000012061PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012062 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012063\n\
12064Return True if all characters in S are considered\n\
12065printable in repr() or S is empty, False otherwise.");
12066
12067static PyObject*
12068unicode_isprintable(PyObject *self)
12069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 Py_ssize_t i, length;
12071 int kind;
12072 void *data;
12073
12074 if (PyUnicode_READY(self) == -1)
12075 return NULL;
12076 length = PyUnicode_GET_LENGTH(self);
12077 kind = PyUnicode_KIND(self);
12078 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012079
12080 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 if (length == 1)
12082 return PyBool_FromLong(
12083 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 for (i = 0; i < length; i++) {
12086 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012087 Py_RETURN_FALSE;
12088 }
12089 }
12090 Py_RETURN_TRUE;
12091}
12092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012093PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012094 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095\n\
12096Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012097iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098
12099static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012100unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012102 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103}
12104
Martin v. Löwis18e16552006-02-15 17:27:45 +000012105static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012106unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 if (PyUnicode_READY(self) == -1)
12109 return -1;
12110 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111}
12112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012113PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012116Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012117done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118
12119static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012120unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012122 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 Py_UCS4 fillchar = ' ';
12124
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012125 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126 return NULL;
12127
Benjamin Petersonbac79492012-01-14 13:34:47 -050012128 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130
Victor Stinnerc4b49542011-12-11 22:44:26 +010012131 if (PyUnicode_GET_LENGTH(self) >= width)
12132 return unicode_result_unchanged(self);
12133
12134 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135}
12136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012137PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012138 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012140Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141
12142static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012143unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012145 if (PyUnicode_READY(self) == -1)
12146 return NULL;
12147 if (PyUnicode_IS_ASCII(self))
12148 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012149 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150}
12151
/* Strip modes; the values double as indexes into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format string for each strip mode, indexed by the
   constants above.  Both the strings and the table itself are read-only,
   hence the double const. */
static const char * const stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its 3-character
   "|O:" prefix skipped */
#define STRIPNAME(i) (stripformat[i]+3)
12160
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012161/* externally visible for str.strip(unicode) */
12162PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012163_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 void *data;
12166 int kind;
12167 Py_ssize_t i, j, len;
12168 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012169 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12172 return NULL;
12173
12174 kind = PyUnicode_KIND(self);
12175 data = PyUnicode_DATA(self);
12176 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012177 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12179 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012180 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012181
Benjamin Peterson14339b62009-01-31 16:36:08 +000012182 i = 0;
12183 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012184 while (i < len) {
12185 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12186 if (!BLOOM(sepmask, ch))
12187 break;
12188 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12189 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012190 i++;
12191 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012192 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012193
Benjamin Peterson14339b62009-01-31 16:36:08 +000012194 j = len;
12195 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012196 j--;
12197 while (j >= i) {
12198 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12199 if (!BLOOM(sepmask, ch))
12200 break;
12201 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12202 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012203 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012204 }
12205
Benjamin Peterson29060642009-01-31 22:14:21 +000012206 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012207 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012208
Victor Stinner7931d9a2011-11-04 00:22:48 +010012209 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210}
12211
12212PyObject*
12213PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12214{
12215 unsigned char *data;
12216 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012217 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218
Victor Stinnerde636f32011-10-01 03:55:54 +020012219 if (PyUnicode_READY(self) == -1)
12220 return NULL;
12221
Victor Stinner684d5fd2012-05-03 02:32:34 +020012222 length = PyUnicode_GET_LENGTH(self);
12223 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012224
Victor Stinner684d5fd2012-05-03 02:32:34 +020012225 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012226 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227
Victor Stinnerde636f32011-10-01 03:55:54 +020012228 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012229 PyErr_SetString(PyExc_IndexError, "string index out of range");
12230 return NULL;
12231 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012232 if (start >= length || end < start)
12233 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012234
Victor Stinner684d5fd2012-05-03 02:32:34 +020012235 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012236 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012237 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012238 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012239 }
12240 else {
12241 kind = PyUnicode_KIND(self);
12242 data = PyUnicode_1BYTE_DATA(self);
12243 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012244 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012245 length);
12246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248
12249static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012250do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 Py_ssize_t len, i, j;
12253
12254 if (PyUnicode_READY(self) == -1)
12255 return NULL;
12256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012258
Victor Stinnercc7af722013-04-09 22:39:24 +020012259 if (PyUnicode_IS_ASCII(self)) {
12260 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12261
12262 i = 0;
12263 if (striptype != RIGHTSTRIP) {
12264 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012265 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012266 if (!_Py_ascii_whitespace[ch])
12267 break;
12268 i++;
12269 }
12270 }
12271
12272 j = len;
12273 if (striptype != LEFTSTRIP) {
12274 j--;
12275 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012276 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012277 if (!_Py_ascii_whitespace[ch])
12278 break;
12279 j--;
12280 }
12281 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012282 }
12283 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012284 else {
12285 int kind = PyUnicode_KIND(self);
12286 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012287
Victor Stinnercc7af722013-04-09 22:39:24 +020012288 i = 0;
12289 if (striptype != RIGHTSTRIP) {
12290 while (i < len) {
12291 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12292 if (!Py_UNICODE_ISSPACE(ch))
12293 break;
12294 i++;
12295 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012296 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012297
12298 j = len;
12299 if (striptype != LEFTSTRIP) {
12300 j--;
12301 while (j >= i) {
12302 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12303 if (!Py_UNICODE_ISSPACE(ch))
12304 break;
12305 j--;
12306 }
12307 j++;
12308 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012309 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012310
Victor Stinner7931d9a2011-11-04 00:22:48 +010012311 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012312}
12313
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012314
12315static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012316do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012317{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012318 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012319
Serhiy Storchakac6792272013-10-19 21:03:34 +030012320 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012321 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012322
Benjamin Peterson14339b62009-01-31 16:36:08 +000012323 if (sep != NULL && sep != Py_None) {
12324 if (PyUnicode_Check(sep))
12325 return _PyUnicode_XStrip(self, striptype, sep);
12326 else {
12327 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012328 "%s arg must be None or str",
12329 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012330 return NULL;
12331 }
12332 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012333
Benjamin Peterson14339b62009-01-31 16:36:08 +000012334 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012335}
12336
12337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012338PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012339 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012340\n\
12341Return a copy of the string S with leading and trailing\n\
12342whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012343If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012344
12345static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012346unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012347{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012348 if (PyTuple_GET_SIZE(args) == 0)
12349 return do_strip(self, BOTHSTRIP); /* Common case */
12350 else
12351 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012352}
12353
12354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012355PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012356 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012357\n\
12358Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012359If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012360
12361static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012362unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012363{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012364 if (PyTuple_GET_SIZE(args) == 0)
12365 return do_strip(self, LEFTSTRIP); /* Common case */
12366 else
12367 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012368}
12369
12370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012371PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012372 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012373\n\
12374Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012375If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012376
12377static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012378unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012379{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012380 if (PyTuple_GET_SIZE(args) == 0)
12381 return do_strip(self, RIGHTSTRIP); /* Common case */
12382 else
12383 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012384}
12385
12386
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012388unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012390 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392
Serhiy Storchaka05997252013-01-26 12:14:02 +020012393 if (len < 1)
12394 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012395
Victor Stinnerc4b49542011-12-11 22:44:26 +010012396 /* no repeat, return original string */
12397 if (len == 1)
12398 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012399
Benjamin Petersonbac79492012-01-14 13:34:47 -050012400 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 return NULL;
12402
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012403 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012404 PyErr_SetString(PyExc_OverflowError,
12405 "repeated string is too long");
12406 return NULL;
12407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012409
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012410 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411 if (!u)
12412 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012413 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415 if (PyUnicode_GET_LENGTH(str) == 1) {
12416 const int kind = PyUnicode_KIND(str);
12417 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012418 if (kind == PyUnicode_1BYTE_KIND) {
12419 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012420 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012421 }
12422 else if (kind == PyUnicode_2BYTE_KIND) {
12423 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012424 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012425 ucs2[n] = fill_char;
12426 } else {
12427 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12428 assert(kind == PyUnicode_4BYTE_KIND);
12429 for (n = 0; n < len; ++n)
12430 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 }
12433 else {
12434 /* number of characters copied this far */
12435 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012436 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 char *to = (char *) PyUnicode_DATA(u);
12438 Py_MEMCPY(to, PyUnicode_DATA(str),
12439 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012440 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 n = (done <= nchars-done) ? done : nchars-done;
12442 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012443 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012444 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445 }
12446
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012447 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012448 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449}
12450
Alexander Belopolsky40018472011-02-26 01:02:56 +000012451PyObject *
12452PyUnicode_Replace(PyObject *obj,
12453 PyObject *subobj,
12454 PyObject *replobj,
12455 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012456{
12457 PyObject *self;
12458 PyObject *str1;
12459 PyObject *str2;
12460 PyObject *result;
12461
12462 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012463 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012464 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012466 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012467 Py_DECREF(self);
12468 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469 }
12470 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012471 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012472 Py_DECREF(self);
12473 Py_DECREF(str1);
12474 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012476 if (PyUnicode_READY(self) == -1 ||
12477 PyUnicode_READY(str1) == -1 ||
12478 PyUnicode_READY(str2) == -1)
12479 result = NULL;
12480 else
12481 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482 Py_DECREF(self);
12483 Py_DECREF(str1);
12484 Py_DECREF(str2);
12485 return result;
12486}
12487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012488PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012489 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490\n\
12491Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012492old replaced by new. If the optional argument count is\n\
12493given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494
12495static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 PyObject *str1;
12499 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012500 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501 PyObject *result;
12502
Martin v. Löwis18e16552006-02-15 17:27:45 +000012503 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012505 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012508 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 return NULL;
12510 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012511 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 Py_DECREF(str1);
12513 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012514 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012515 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12516 result = NULL;
12517 else
12518 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519
12520 Py_DECREF(str1);
12521 Py_DECREF(str2);
12522 return result;
12523}
12524
Alexander Belopolsky40018472011-02-26 01:02:56 +000012525static PyObject *
12526unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012528 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 Py_ssize_t isize;
12530 Py_ssize_t osize, squote, dquote, i, o;
12531 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012532 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012536 return NULL;
12537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 isize = PyUnicode_GET_LENGTH(unicode);
12539 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 /* Compute length of output, quote characters, and
12542 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012543 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 max = 127;
12545 squote = dquote = 0;
12546 ikind = PyUnicode_KIND(unicode);
12547 for (i = 0; i < isize; i++) {
12548 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012549 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012551 case '\'': squote++; break;
12552 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012554 incr = 2;
12555 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 default:
12557 /* Fast-path ASCII */
12558 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012559 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012561 ;
12562 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012565 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012567 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012568 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012569 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012571 if (osize > PY_SSIZE_T_MAX - incr) {
12572 PyErr_SetString(PyExc_OverflowError,
12573 "string is too long to generate repr");
12574 return NULL;
12575 }
12576 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 }
12578
12579 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012580 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012582 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 if (dquote)
12584 /* Both squote and dquote present. Use squote,
12585 and escape them */
12586 osize += squote;
12587 else
12588 quote = '"';
12589 }
Victor Stinner55c08782013-04-14 18:45:39 +020012590 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591
12592 repr = PyUnicode_New(osize, max);
12593 if (repr == NULL)
12594 return NULL;
12595 okind = PyUnicode_KIND(repr);
12596 odata = PyUnicode_DATA(repr);
12597
12598 PyUnicode_WRITE(okind, odata, 0, quote);
12599 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012600 if (unchanged) {
12601 _PyUnicode_FastCopyCharacters(repr, 1,
12602 unicode, 0,
12603 isize);
12604 }
12605 else {
12606 for (i = 0, o = 1; i < isize; i++) {
12607 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608
Victor Stinner55c08782013-04-14 18:45:39 +020012609 /* Escape quotes and backslashes */
12610 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012611 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012613 continue;
12614 }
12615
12616 /* Map special whitespace to '\t', \n', '\r' */
12617 if (ch == '\t') {
12618 PyUnicode_WRITE(okind, odata, o++, '\\');
12619 PyUnicode_WRITE(okind, odata, o++, 't');
12620 }
12621 else if (ch == '\n') {
12622 PyUnicode_WRITE(okind, odata, o++, '\\');
12623 PyUnicode_WRITE(okind, odata, o++, 'n');
12624 }
12625 else if (ch == '\r') {
12626 PyUnicode_WRITE(okind, odata, o++, '\\');
12627 PyUnicode_WRITE(okind, odata, o++, 'r');
12628 }
12629
12630 /* Map non-printable US ASCII to '\xhh' */
12631 else if (ch < ' ' || ch == 0x7F) {
12632 PyUnicode_WRITE(okind, odata, o++, '\\');
12633 PyUnicode_WRITE(okind, odata, o++, 'x');
12634 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12635 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12636 }
12637
12638 /* Copy ASCII characters as-is */
12639 else if (ch < 0x7F) {
12640 PyUnicode_WRITE(okind, odata, o++, ch);
12641 }
12642
12643 /* Non-ASCII characters */
12644 else {
12645 /* Map Unicode whitespace and control characters
12646 (categories Z* and C* except ASCII space)
12647 */
12648 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12649 PyUnicode_WRITE(okind, odata, o++, '\\');
12650 /* Map 8-bit characters to '\xhh' */
12651 if (ch <= 0xff) {
12652 PyUnicode_WRITE(okind, odata, o++, 'x');
12653 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12654 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12655 }
12656 /* Map 16-bit characters to '\uxxxx' */
12657 else if (ch <= 0xffff) {
12658 PyUnicode_WRITE(okind, odata, o++, 'u');
12659 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12660 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12661 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12662 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12663 }
12664 /* Map 21-bit characters to '\U00xxxxxx' */
12665 else {
12666 PyUnicode_WRITE(okind, odata, o++, 'U');
12667 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12668 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12669 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12670 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12671 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12672 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12673 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12674 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12675 }
12676 }
12677 /* Copy characters as-is */
12678 else {
12679 PyUnicode_WRITE(okind, odata, o++, ch);
12680 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012681 }
12682 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012683 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012685 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012686 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687}
12688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012689PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012690 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691\n\
12692Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012693such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694arguments start and end are interpreted as in slice notation.\n\
12695\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012696Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697
12698static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012701 /* initialize variables to prevent gcc warning */
12702 PyObject *substring = NULL;
12703 Py_ssize_t start = 0;
12704 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012705 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706
Jesus Ceaac451502011-04-20 17:09:23 +020012707 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12708 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012710
Christian Heimesea71a522013-06-29 21:17:34 +020012711 if (PyUnicode_READY(self) == -1) {
12712 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012714 }
12715 if (PyUnicode_READY(substring) == -1) {
12716 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012718 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719
Victor Stinner7931d9a2011-11-04 00:22:48 +010012720 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721
12722 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 if (result == -2)
12725 return NULL;
12726
Christian Heimes217cfd12007-12-02 14:31:20 +000012727 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012728}
12729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012730PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012731 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012733Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734
12735static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012738 /* initialize variables to prevent gcc warning */
12739 PyObject *substring = NULL;
12740 Py_ssize_t start = 0;
12741 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012742 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743
Jesus Ceaac451502011-04-20 17:09:23 +020012744 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12745 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747
Christian Heimesea71a522013-06-29 21:17:34 +020012748 if (PyUnicode_READY(self) == -1) {
12749 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012751 }
12752 if (PyUnicode_READY(substring) == -1) {
12753 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756
Victor Stinner7931d9a2011-11-04 00:22:48 +010012757 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012758
12759 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012761 if (result == -2)
12762 return NULL;
12763
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764 if (result < 0) {
12765 PyErr_SetString(PyExc_ValueError, "substring not found");
12766 return NULL;
12767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012768
Christian Heimes217cfd12007-12-02 14:31:20 +000012769 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770}
12771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012772PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012773 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012774\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012775Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012776done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012777
12778static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012779unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012781 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782 Py_UCS4 fillchar = ' ';
12783
Victor Stinnere9a29352011-10-01 02:14:59 +020012784 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012786
Benjamin Petersonbac79492012-01-14 13:34:47 -050012787 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012788 return NULL;
12789
Victor Stinnerc4b49542011-12-11 22:44:26 +010012790 if (PyUnicode_GET_LENGTH(self) >= width)
12791 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792
Victor Stinnerc4b49542011-12-11 22:44:26 +010012793 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794}
12795
Alexander Belopolsky40018472011-02-26 01:02:56 +000012796PyObject *
12797PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798{
12799 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012800
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801 s = PyUnicode_FromObject(s);
12802 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012803 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012804 if (sep != NULL) {
12805 sep = PyUnicode_FromObject(sep);
12806 if (sep == NULL) {
12807 Py_DECREF(s);
12808 return NULL;
12809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810 }
12811
Victor Stinner9310abb2011-10-05 00:59:23 +020012812 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813
12814 Py_DECREF(s);
12815 Py_XDECREF(sep);
12816 return result;
12817}
12818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012819PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012820 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821\n\
12822Return a list of the words in S, using sep as the\n\
12823delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012824splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012825whitespace string is a separator and empty strings are\n\
12826removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012827
12828static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012829unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012831 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012833 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012835 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12836 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837 return NULL;
12838
12839 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012840 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012842 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012844 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845}
12846
Thomas Wouters477c8d52006-05-27 19:21:47 +000012847PyObject *
12848PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12849{
12850 PyObject* str_obj;
12851 PyObject* sep_obj;
12852 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012853 int kind1, kind2;
12854 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012855 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012856
12857 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012858 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012859 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012860 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012861 if (!sep_obj) {
12862 Py_DECREF(str_obj);
12863 return NULL;
12864 }
12865 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12866 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012867 Py_DECREF(str_obj);
12868 return NULL;
12869 }
12870
Victor Stinner14f8f022011-10-05 20:58:25 +020012871 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873 len1 = PyUnicode_GET_LENGTH(str_obj);
12874 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012875 if (kind1 < kind2 || len1 < len2) {
12876 _Py_INCREF_UNICODE_EMPTY();
12877 if (!unicode_empty)
12878 out = NULL;
12879 else {
12880 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12881 Py_DECREF(unicode_empty);
12882 }
12883 Py_DECREF(sep_obj);
12884 Py_DECREF(str_obj);
12885 return out;
12886 }
12887 buf1 = PyUnicode_DATA(str_obj);
12888 buf2 = PyUnicode_DATA(sep_obj);
12889 if (kind2 != kind1) {
12890 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12891 if (!buf2)
12892 goto onError;
12893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012894
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012895 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012896 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012897 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12898 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12899 else
12900 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012901 break;
12902 case PyUnicode_2BYTE_KIND:
12903 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12904 break;
12905 case PyUnicode_4BYTE_KIND:
12906 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12907 break;
12908 default:
12909 assert(0);
12910 out = 0;
12911 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012912
12913 Py_DECREF(sep_obj);
12914 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012915 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012917
12918 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919 onError:
12920 Py_DECREF(sep_obj);
12921 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012922 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923 PyMem_Free(buf2);
12924 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012925}
12926
12927
12928PyObject *
12929PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12930{
12931 PyObject* str_obj;
12932 PyObject* sep_obj;
12933 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012934 int kind1, kind2;
12935 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012937
12938 str_obj = PyUnicode_FromObject(str_in);
12939 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012940 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012941 sep_obj = PyUnicode_FromObject(sep_in);
12942 if (!sep_obj) {
12943 Py_DECREF(str_obj);
12944 return NULL;
12945 }
12946
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012947 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 len1 = PyUnicode_GET_LENGTH(str_obj);
12950 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012951 if (kind1 < kind2 || len1 < len2) {
12952 _Py_INCREF_UNICODE_EMPTY();
12953 if (!unicode_empty)
12954 out = NULL;
12955 else {
12956 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12957 Py_DECREF(unicode_empty);
12958 }
12959 Py_DECREF(sep_obj);
12960 Py_DECREF(str_obj);
12961 return out;
12962 }
12963 buf1 = PyUnicode_DATA(str_obj);
12964 buf2 = PyUnicode_DATA(sep_obj);
12965 if (kind2 != kind1) {
12966 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12967 if (!buf2)
12968 goto onError;
12969 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012971 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012973 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12974 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12975 else
12976 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977 break;
12978 case PyUnicode_2BYTE_KIND:
12979 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12980 break;
12981 case PyUnicode_4BYTE_KIND:
12982 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12983 break;
12984 default:
12985 assert(0);
12986 out = 0;
12987 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012988
12989 Py_DECREF(sep_obj);
12990 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012991 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012993
12994 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012995 onError:
12996 Py_DECREF(sep_obj);
12997 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012998 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 PyMem_Free(buf2);
13000 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013001}
13002
13003PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013004 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000013005\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000013006Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000013007the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000013008found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013009
13010static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013011unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013012{
Victor Stinner9310abb2011-10-05 00:59:23 +020013013 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013014}
13015
13016PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000013017 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000013018\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000013019Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000013020the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000013021separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013022
13023static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013024unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013025{
Victor Stinner9310abb2011-10-05 00:59:23 +020013026 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013027}
13028
Alexander Belopolsky40018472011-02-26 01:02:56 +000013029PyObject *
13030PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013031{
13032 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013033
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013034 s = PyUnicode_FromObject(s);
13035 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013036 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013037 if (sep != NULL) {
13038 sep = PyUnicode_FromObject(sep);
13039 if (sep == NULL) {
13040 Py_DECREF(s);
13041 return NULL;
13042 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013043 }
13044
Victor Stinner9310abb2011-10-05 00:59:23 +020013045 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013046
13047 Py_DECREF(s);
13048 Py_XDECREF(sep);
13049 return result;
13050}
13051
13052PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013053 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013054\n\
13055Return a list of the words in S, using sep as the\n\
13056delimiter string, starting at the end of the string and\n\
13057working to the front. If maxsplit is given, at most maxsplit\n\
13058splits are done. If sep is not specified, any whitespace string\n\
13059is a separator.");
13060
13061static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013062unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013063{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013064 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013065 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013066 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013067
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013068 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
13069 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013070 return NULL;
13071
13072 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000013073 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013074 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020013075 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013076 else
Victor Stinner9310abb2011-10-05 00:59:23 +020013077 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013078}
13079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013080PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013081 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082\n\
13083Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000013084Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013085is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086
13087static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013088unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013090 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000013091 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013093 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
13094 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095 return NULL;
13096
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013097 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098}
13099
13100static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013101PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013103 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104}
13105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013106PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013107 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108\n\
13109Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013110and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111
13112static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013113unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013115 if (PyUnicode_READY(self) == -1)
13116 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013117 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118}
13119
Larry Hastings61272b72014-01-07 12:41:53 -080013120/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013121
Larry Hastings31826802013-10-19 00:09:25 -070013122@staticmethod
13123str.maketrans as unicode_maketrans
13124
13125 x: object
13126
13127 y: unicode=NULL
13128
13129 z: unicode=NULL
13130
13131 /
13132
13133Return a translation table usable for str.translate().
13134
13135If there is only one argument, it must be a dictionary mapping Unicode
13136ordinals (integers) or characters to Unicode ordinals, strings or None.
13137Character keys will be then converted to ordinals.
13138If there are two arguments, they must be strings of equal length, and
13139in the resulting dictionary, each character in x will be mapped to the
13140character at the same position in y. If there is a third argument, it
13141must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013142[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013143
Larry Hastings31826802013-10-19 00:09:25 -070013144static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013145unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013146/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013147{
Georg Brandlceee0772007-11-27 23:48:05 +000013148 PyObject *new = NULL, *key, *value;
13149 Py_ssize_t i = 0;
13150 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013151
Georg Brandlceee0772007-11-27 23:48:05 +000013152 new = PyDict_New();
13153 if (!new)
13154 return NULL;
13155 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156 int x_kind, y_kind, z_kind;
13157 void *x_data, *y_data, *z_data;
13158
Georg Brandlceee0772007-11-27 23:48:05 +000013159 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013160 if (!PyUnicode_Check(x)) {
13161 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13162 "be a string if there is a second argument");
13163 goto err;
13164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013165 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013166 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13167 "arguments must have equal length");
13168 goto err;
13169 }
13170 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013171 x_kind = PyUnicode_KIND(x);
13172 y_kind = PyUnicode_KIND(y);
13173 x_data = PyUnicode_DATA(x);
13174 y_data = PyUnicode_DATA(y);
13175 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13176 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013177 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013178 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013179 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013180 if (!value) {
13181 Py_DECREF(key);
13182 goto err;
13183 }
Georg Brandlceee0772007-11-27 23:48:05 +000013184 res = PyDict_SetItem(new, key, value);
13185 Py_DECREF(key);
13186 Py_DECREF(value);
13187 if (res < 0)
13188 goto err;
13189 }
13190 /* create entries for deleting chars in z */
13191 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 z_kind = PyUnicode_KIND(z);
13193 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013194 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013196 if (!key)
13197 goto err;
13198 res = PyDict_SetItem(new, key, Py_None);
13199 Py_DECREF(key);
13200 if (res < 0)
13201 goto err;
13202 }
13203 }
13204 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 int kind;
13206 void *data;
13207
Georg Brandlceee0772007-11-27 23:48:05 +000013208 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013209 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013210 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13211 "to maketrans it must be a dict");
13212 goto err;
13213 }
13214 /* copy entries into the new dict, converting string keys to int keys */
13215 while (PyDict_Next(x, &i, &key, &value)) {
13216 if (PyUnicode_Check(key)) {
13217 /* convert string keys to integer keys */
13218 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013219 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013220 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13221 "table must be of length 1");
13222 goto err;
13223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013224 kind = PyUnicode_KIND(key);
13225 data = PyUnicode_DATA(key);
13226 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013227 if (!newkey)
13228 goto err;
13229 res = PyDict_SetItem(new, newkey, value);
13230 Py_DECREF(newkey);
13231 if (res < 0)
13232 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013233 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013234 /* just keep integer keys */
13235 if (PyDict_SetItem(new, key, value) < 0)
13236 goto err;
13237 } else {
13238 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13239 "be strings or integers");
13240 goto err;
13241 }
13242 }
13243 }
13244 return new;
13245 err:
13246 Py_DECREF(new);
13247 return NULL;
13248}
13249
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013250PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013251 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013253Return a copy of the string S in which each character has been mapped\n\
13254through the given translation table. The table must implement\n\
13255lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13256mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13257this operation raises LookupError, the character is left untouched.\n\
13258Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013259
13260static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013261unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013263 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013264}
13265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013266PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013267 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013268\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013269Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270
13271static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013272unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013273{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013274 if (PyUnicode_READY(self) == -1)
13275 return NULL;
13276 if (PyUnicode_IS_ASCII(self))
13277 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013278 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013279}
13280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013281PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013282 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013283\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013284Pad a numeric string S with zeros on the left, to fill a field\n\
13285of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013286
13287static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013288unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013290 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013291 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013292 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 int kind;
13294 void *data;
13295 Py_UCS4 chr;
13296
Martin v. Löwis18e16552006-02-15 17:27:45 +000013297 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298 return NULL;
13299
Benjamin Petersonbac79492012-01-14 13:34:47 -050013300 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013301 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013302
Victor Stinnerc4b49542011-12-11 22:44:26 +010013303 if (PyUnicode_GET_LENGTH(self) >= width)
13304 return unicode_result_unchanged(self);
13305
13306 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013307
13308 u = pad(self, fill, 0, '0');
13309
Walter Dörwald068325e2002-04-15 13:36:47 +000013310 if (u == NULL)
13311 return NULL;
13312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013313 kind = PyUnicode_KIND(u);
13314 data = PyUnicode_DATA(u);
13315 chr = PyUnicode_READ(kind, data, fill);
13316
13317 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013319 PyUnicode_WRITE(kind, data, 0, chr);
13320 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013321 }
13322
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013323 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013324 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013325}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013326
#if 0
/* Compiled out: wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii_form = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii_form;
}
#endif
13334
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013335PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013336 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013337\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013338Return True if S starts with the specified prefix, False otherwise.\n\
13339With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013340With optional end, stop comparing S at that position.\n\
13341prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013342
13343static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013344unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013345 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013346{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013347 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013348 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013349 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013350 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013351 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013352
Jesus Ceaac451502011-04-20 17:09:23 +020013353 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013354 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013355 if (PyTuple_Check(subobj)) {
13356 Py_ssize_t i;
13357 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013358 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013359 if (substring == NULL)
13360 return NULL;
13361 result = tailmatch(self, substring, start, end, -1);
13362 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013363 if (result == -1)
13364 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013365 if (result) {
13366 Py_RETURN_TRUE;
13367 }
13368 }
13369 /* nothing matched */
13370 Py_RETURN_FALSE;
13371 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013372 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013373 if (substring == NULL) {
13374 if (PyErr_ExceptionMatches(PyExc_TypeError))
13375 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13376 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013377 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013378 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013379 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013380 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013381 if (result == -1)
13382 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013383 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384}
13385
13386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013387PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013388 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013389\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013390Return True if S ends with the specified suffix, False otherwise.\n\
13391With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013392With optional end, stop comparing S at that position.\n\
13393suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013394
13395static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013396unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013397 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013398{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013399 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013400 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013401 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013402 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013403 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013404
Jesus Ceaac451502011-04-20 17:09:23 +020013405 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013406 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013407 if (PyTuple_Check(subobj)) {
13408 Py_ssize_t i;
13409 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013410 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013412 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013413 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013414 result = tailmatch(self, substring, start, end, +1);
13415 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013416 if (result == -1)
13417 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013418 if (result) {
13419 Py_RETURN_TRUE;
13420 }
13421 }
13422 Py_RETURN_FALSE;
13423 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013424 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013425 if (substring == NULL) {
13426 if (PyErr_ExceptionMatches(PyExc_TypeError))
13427 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13428 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013429 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013430 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013431 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013432 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013433 if (result == -1)
13434 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013435 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436}
13437
Victor Stinner202fdca2012-05-07 12:47:02 +020013438Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013439_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013440{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013441 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13442 writer->data = PyUnicode_DATA(writer->buffer);
13443
13444 if (!writer->readonly) {
13445 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013446 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013447 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013448 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013449 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13450 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13451 writer->kind = PyUnicode_WCHAR_KIND;
13452 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13453
Victor Stinner8f674cc2013-04-17 23:02:17 +020013454 /* Copy-on-write mode: set buffer size to 0 so
13455 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13456 * next write. */
13457 writer->size = 0;
13458 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013459}
13460
Victor Stinnerd3f08822012-05-29 12:57:52 +020013461void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013462_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013463{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013464 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013465
13466 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013467 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013468
13469 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13470 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13471 writer->kind = PyUnicode_WCHAR_KIND;
13472 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013473}
13474
Victor Stinnerd3f08822012-05-29 12:57:52 +020013475int
13476_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13477 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013478{
13479 Py_ssize_t newlen;
13480 PyObject *newbuffer;
13481
Victor Stinnerca9381e2015-09-22 00:58:32 +020013482 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013483 assert((maxchar > writer->maxchar && length >= 0)
13484 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013485
Victor Stinner202fdca2012-05-07 12:47:02 +020013486 if (length > PY_SSIZE_T_MAX - writer->pos) {
13487 PyErr_NoMemory();
13488 return -1;
13489 }
13490 newlen = writer->pos + length;
13491
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013492 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013493
Victor Stinnerd3f08822012-05-29 12:57:52 +020013494 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013495 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013496 if (writer->overallocate
13497 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13498 /* overallocate to limit the number of realloc() */
13499 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013500 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013501 if (newlen < writer->min_length)
13502 newlen = writer->min_length;
13503
Victor Stinnerd3f08822012-05-29 12:57:52 +020013504 writer->buffer = PyUnicode_New(newlen, maxchar);
13505 if (writer->buffer == NULL)
13506 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013507 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013508 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013509 if (writer->overallocate
13510 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13511 /* overallocate to limit the number of realloc() */
13512 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013513 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013514 if (newlen < writer->min_length)
13515 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013516
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013517 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013518 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013519 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013520 newbuffer = PyUnicode_New(newlen, maxchar);
13521 if (newbuffer == NULL)
13522 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013523 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13524 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013525 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013526 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013527 }
13528 else {
13529 newbuffer = resize_compact(writer->buffer, newlen);
13530 if (newbuffer == NULL)
13531 return -1;
13532 }
13533 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013534 }
13535 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013536 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013537 newbuffer = PyUnicode_New(writer->size, maxchar);
13538 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013539 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013540 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13541 writer->buffer, 0, writer->pos);
13542 Py_DECREF(writer->buffer);
13543 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013544 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013545 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013546 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013547
13548#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013549}
13550
Victor Stinnerca9381e2015-09-22 00:58:32 +020013551int
13552_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13553 enum PyUnicode_Kind kind)
13554{
13555 Py_UCS4 maxchar;
13556
13557 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13558 assert(writer->kind < kind);
13559
13560 switch (kind)
13561 {
13562 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13563 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13564 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13565 default:
13566 assert(0 && "invalid kind");
13567 return -1;
13568 }
13569
13570 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13571}
13572
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013573Py_LOCAL_INLINE(int)
13574_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013575{
13576 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13577 return -1;
13578 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13579 writer->pos++;
13580 return 0;
13581}
13582
13583int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013584_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13585{
13586 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13587}
13588
13589int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013590_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13591{
13592 Py_UCS4 maxchar;
13593 Py_ssize_t len;
13594
13595 if (PyUnicode_READY(str) == -1)
13596 return -1;
13597 len = PyUnicode_GET_LENGTH(str);
13598 if (len == 0)
13599 return 0;
13600 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13601 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013602 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013603 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013604 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013605 Py_INCREF(str);
13606 writer->buffer = str;
13607 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013608 writer->pos += len;
13609 return 0;
13610 }
13611 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13612 return -1;
13613 }
13614 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13615 str, 0, len);
13616 writer->pos += len;
13617 return 0;
13618}
13619
Victor Stinnere215d962012-10-06 23:03:36 +020013620int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013621_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13622 Py_ssize_t start, Py_ssize_t end)
13623{
13624 Py_UCS4 maxchar;
13625 Py_ssize_t len;
13626
13627 if (PyUnicode_READY(str) == -1)
13628 return -1;
13629
13630 assert(0 <= start);
13631 assert(end <= PyUnicode_GET_LENGTH(str));
13632 assert(start <= end);
13633
13634 if (end == 0)
13635 return 0;
13636
13637 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13638 return _PyUnicodeWriter_WriteStr(writer, str);
13639
13640 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13641 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13642 else
13643 maxchar = writer->maxchar;
13644 len = end - start;
13645
13646 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13647 return -1;
13648
13649 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13650 str, start, len);
13651 writer->pos += len;
13652 return 0;
13653}
13654
13655int
Victor Stinner4a587072013-11-19 12:54:53 +010013656_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13657 const char *ascii, Py_ssize_t len)
13658{
13659 if (len == -1)
13660 len = strlen(ascii);
13661
13662 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13663
13664 if (writer->buffer == NULL && !writer->overallocate) {
13665 PyObject *str;
13666
13667 str = _PyUnicode_FromASCII(ascii, len);
13668 if (str == NULL)
13669 return -1;
13670
13671 writer->readonly = 1;
13672 writer->buffer = str;
13673 _PyUnicodeWriter_Update(writer);
13674 writer->pos += len;
13675 return 0;
13676 }
13677
13678 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13679 return -1;
13680
13681 switch (writer->kind)
13682 {
13683 case PyUnicode_1BYTE_KIND:
13684 {
13685 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13686 Py_UCS1 *data = writer->data;
13687
13688 Py_MEMCPY(data + writer->pos, str, len);
13689 break;
13690 }
13691 case PyUnicode_2BYTE_KIND:
13692 {
13693 _PyUnicode_CONVERT_BYTES(
13694 Py_UCS1, Py_UCS2,
13695 ascii, ascii + len,
13696 (Py_UCS2 *)writer->data + writer->pos);
13697 break;
13698 }
13699 case PyUnicode_4BYTE_KIND:
13700 {
13701 _PyUnicode_CONVERT_BYTES(
13702 Py_UCS1, Py_UCS4,
13703 ascii, ascii + len,
13704 (Py_UCS4 *)writer->data + writer->pos);
13705 break;
13706 }
13707 default:
13708 assert(0);
13709 }
13710
13711 writer->pos += len;
13712 return 0;
13713}
13714
13715int
13716_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13717 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013718{
13719 Py_UCS4 maxchar;
13720
13721 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13722 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13723 return -1;
13724 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13725 writer->pos += len;
13726 return 0;
13727}
13728
Victor Stinnerd3f08822012-05-29 12:57:52 +020013729PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013730_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013731{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013732 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013733 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013734 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013735 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013736 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013737 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013738 str = writer->buffer;
13739 writer->buffer = NULL;
13740 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13741 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013742 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013743 if (writer->pos == 0) {
13744 Py_CLEAR(writer->buffer);
13745
13746 /* Get the empty Unicode string singleton ('') */
13747 _Py_INCREF_UNICODE_EMPTY();
13748 str = unicode_empty;
Victor Stinner202fdca2012-05-07 12:47:02 +020013749 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013750 else {
13751 str = writer->buffer;
13752 writer->buffer = NULL;
13753
13754 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13755 PyObject *str2;
13756 str2 = resize_compact(str, writer->pos);
13757 if (str2 == NULL)
13758 return NULL;
13759 str = str2;
13760 }
13761 }
13762
Victor Stinner15a0bd32013-07-08 22:29:55 +020013763 assert(_PyUnicode_CheckConsistency(str, 1));
13764 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013765}
13766
Victor Stinnerd3f08822012-05-29 12:57:52 +020013767void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013768_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013769{
13770 Py_CLEAR(writer->buffer);
13771}
13772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013773#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013774
13775PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013776 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013777\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013778Return a formatted version of S, using substitutions from args and kwargs.\n\
13779The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013780
Eric Smith27bbca62010-11-04 17:06:58 +000013781PyDoc_STRVAR(format_map__doc__,
13782 "S.format_map(mapping) -> str\n\
13783\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013784Return a formatted version of S, using substitutions from mapping.\n\
13785The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013786
Eric Smith4a7d76d2008-05-30 18:10:19 +000013787static PyObject *
13788unicode__format__(PyObject* self, PyObject* args)
13789{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013790 PyObject *format_spec;
13791 _PyUnicodeWriter writer;
13792 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013793
13794 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13795 return NULL;
13796
Victor Stinnerd3f08822012-05-29 12:57:52 +020013797 if (PyUnicode_READY(self) == -1)
13798 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013799 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013800 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13801 self, format_spec, 0,
13802 PyUnicode_GET_LENGTH(format_spec));
13803 if (ret == -1) {
13804 _PyUnicodeWriter_Dealloc(&writer);
13805 return NULL;
13806 }
13807 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013808}
13809
Eric Smith8c663262007-08-25 02:26:07 +000013810PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013811 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013812\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013813Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013814
13815static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013816unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013818 Py_ssize_t size;
13819
13820 /* If it's a compact object, account for base structure +
13821 character data. */
13822 if (PyUnicode_IS_COMPACT_ASCII(v))
13823 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13824 else if (PyUnicode_IS_COMPACT(v))
13825 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013826 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013827 else {
13828 /* If it is a two-block object, account for base object, and
13829 for character block if present. */
13830 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013831 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013832 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013833 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013834 }
13835 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013836 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013837 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013838 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013839 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013840 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013841
13842 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013843}
13844
13845PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013846 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013847
13848static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013849unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013850{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013851 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013852 if (!copy)
13853 return NULL;
13854 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013855}
13856
/* Method table of the str type.  Each entry gives the Python-level method
   name, the C implementation, the calling convention flags and the
   docstring.  The table ends with a {NULL, NULL} sentinel entry. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs,
     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines,
     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* Entry supplied by a macro defined elsewhere; it carries its own
       trailing comma. */
    UNICODE_MAKETRANS_METHODDEF
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is given for __getnewargs__. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}    /* sentinel */
};
13914
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013915static PyObject *
13916unicode_mod(PyObject *v, PyObject *w)
13917{
Brian Curtindfc80e32011-08-10 20:28:54 -050013918 if (!PyUnicode_Check(v))
13919 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013920 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013921}
13922
/* Number protocol slots of the str type.  Only nb_remainder is filled in
   (the % operator, handled by unicode_mod); the slots after it are left to
   the default zero initialization of the remaining struct members. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13929
/* Sequence protocol slots of the str type: length, concatenation,
   repetition, item access and containment.  The slice and item-assignment
   slots are left empty. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13940
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013941static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013942unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013943{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013944 if (PyUnicode_READY(self) == -1)
13945 return NULL;
13946
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013947 if (PyIndex_Check(item)) {
13948 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013949 if (i == -1 && PyErr_Occurred())
13950 return NULL;
13951 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013952 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013953 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013954 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013955 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013956 PyObject *result;
13957 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013958 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013959 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013961 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013962 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013963 return NULL;
13964 }
13965
13966 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013967 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013968 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013969 slicelength == PyUnicode_GET_LENGTH(self)) {
13970 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013971 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013972 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013973 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013974 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013975 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013976 src_kind = PyUnicode_KIND(self);
13977 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013978 if (!PyUnicode_IS_ASCII(self)) {
13979 kind_limit = kind_maxchar_limit(src_kind);
13980 max_char = 0;
13981 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13982 ch = PyUnicode_READ(src_kind, src_data, cur);
13983 if (ch > max_char) {
13984 max_char = ch;
13985 if (max_char >= kind_limit)
13986 break;
13987 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013988 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013989 }
Victor Stinner55c99112011-10-13 01:17:06 +020013990 else
13991 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013992 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013993 if (result == NULL)
13994 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013995 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013996 dest_data = PyUnicode_DATA(result);
13997
13998 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013999 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14000 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014001 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014002 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014003 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014004 } else {
14005 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14006 return NULL;
14007 }
14008}
14009
/* Mapping protocol slots of the str type: length and subscript (index or
   slice) are provided; subscript assignment is left empty. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
14015
Guido van Rossumd57fd912000-03-10 22:53:23 +000014016
Guido van Rossumd57fd912000-03-10 22:53:23 +000014017/* Helpers for PyUnicode_Format() */
14018
/* State shared by the PyUnicode_Format() helpers. */
struct unicode_formatter_t {
    /* Arguments to consume: a tuple read with PyTuple_GetItem(), or a single
       object returned as-is when arglen is negative (see
       unicode_format_getnextarg()). */
    PyObject *args;
    int args_owned;     /* presumably non-zero when args must be released
                           by the formatter -- TODO confirm */
    Py_ssize_t arglen, argidx;  /* argument count and index of the next
                                   argument to hand out */
    PyObject *dict;     /* presumably the mapping used for %(name)s
                           lookups -- TODO confirm */

    /* Format string being processed (NOTE(review): field meanings inferred
       from the names; confirm against PyUnicode_Format()). */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    /* Writer that accumulates the formatted output. */
    _PyUnicodeWriter writer;
};
14032
/* Parsed description of a single conversion specifier. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character, e.g. 'd', 'x' or a float
                           format code */
    int flags;          /* bitmask of F_* flags (F_ALT, F_SIGN, F_BLANK, ...) */
    Py_ssize_t width;   /* field width; mainformatlong() treats -1 as
                           "not given" */
    int prec;           /* precision; a negative value means "not given"
                           (formatfloat() then uses 6) */
    int sign;           /* NOTE(review): not used by the helpers visible
                           here -- confirm meaning against the caller */
};
14040
Guido van Rossumd57fd912000-03-10 22:53:23 +000014041static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014042unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014043{
Victor Stinnera47082312012-10-04 02:19:54 +020014044 Py_ssize_t argidx = ctx->argidx;
14045
14046 if (argidx < ctx->arglen) {
14047 ctx->argidx++;
14048 if (ctx->arglen < 0)
14049 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014050 else
Victor Stinnera47082312012-10-04 02:19:54 +020014051 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014052 }
14053 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014054 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014055 return NULL;
14056}
14057
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014058/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014059
Victor Stinnera47082312012-10-04 02:19:54 +020014060/* Format a float into the writer if the writer is not NULL, or into *p_output
14061 otherwise.
14062
14063 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014064static int
Victor Stinnera47082312012-10-04 02:19:54 +020014065formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14066 PyObject **p_output,
14067 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014068{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014069 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014070 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014071 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014072 int prec;
14073 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014074
Guido van Rossumd57fd912000-03-10 22:53:23 +000014075 x = PyFloat_AsDouble(v);
14076 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014077 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014078
Victor Stinnera47082312012-10-04 02:19:54 +020014079 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014080 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014081 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014082
Victor Stinnera47082312012-10-04 02:19:54 +020014083 if (arg->flags & F_ALT)
14084 dtoa_flags = Py_DTSF_ALT;
14085 else
14086 dtoa_flags = 0;
14087 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014088 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014089 return -1;
14090 len = strlen(p);
14091 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014092 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014093 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014094 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014095 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014096 }
14097 else
14098 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014099 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014100 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014101}
14102
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate-form flag, for Python ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 * The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 * The base marker is present only for o, x and X conversions, and only
 * when alt is non-zero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 * val          object to be converted; must be an int (checked by assert)
 * alt          non-zero to keep the base marker (alternate form)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * Raises OverflowError when prec is larger than INT_MAX-3.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Avoid exceeding SSIZE_T_MAX */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    /* Convert to text in the requested base.  An unexpected type only
       triggers the assertion; it then falls through to decimal. */
    switch (type) {
    default:
        assert(!"'type' not in [diuoxX]");
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* two non-digit characters: the base marker */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        /* two non-digit characters: the base marker */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* The length is tracked in an int from here on. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt is set: advance buf past two
       characters and, for negative numbers, rewrite the '-' at the new
       start. */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to reach the minimum number of digits.  The
       padded text is built in a temporary bytes object which replaces
       result. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        /* sign and base marker first, then the zero padding, then the
           digits */
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* result is either the temporary bytes object, or a str whose text no
       longer starts at its data pointer: build a fresh str from buf. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    /* Otherwise the str was edited in place; resize it if the length
       changed. */
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14248
Ethan Furmandf3ed242014-01-05 06:50:30 -080014249/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014250 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014251 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014252 * -1 and raise an exception on error */
14253static int
Victor Stinnera47082312012-10-04 02:19:54 +020014254mainformatlong(PyObject *v,
14255 struct unicode_format_arg_t *arg,
14256 PyObject **p_output,
14257 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014258{
14259 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014260 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014261
14262 if (!PyNumber_Check(v))
14263 goto wrongtype;
14264
Ethan Furman9ab74802014-03-21 06:38:46 -070014265 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014266 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014267 if (type == 'o' || type == 'x' || type == 'X') {
14268 iobj = PyNumber_Index(v);
14269 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014270 if (PyErr_ExceptionMatches(PyExc_TypeError))
14271 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014272 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014273 }
14274 }
14275 else {
14276 iobj = PyNumber_Long(v);
14277 if (iobj == NULL ) {
14278 if (PyErr_ExceptionMatches(PyExc_TypeError))
14279 goto wrongtype;
14280 return -1;
14281 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014282 }
14283 assert(PyLong_Check(iobj));
14284 }
14285 else {
14286 iobj = v;
14287 Py_INCREF(iobj);
14288 }
14289
14290 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014291 && arg->width == -1 && arg->prec == -1
14292 && !(arg->flags & (F_SIGN | F_BLANK))
14293 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014294 {
14295 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014296 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014297 int base;
14298
Victor Stinnera47082312012-10-04 02:19:54 +020014299 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014300 {
14301 default:
14302 assert(0 && "'type' not in [diuoxX]");
14303 case 'd':
14304 case 'i':
14305 case 'u':
14306 base = 10;
14307 break;
14308 case 'o':
14309 base = 8;
14310 break;
14311 case 'x':
14312 case 'X':
14313 base = 16;
14314 break;
14315 }
14316
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014317 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14318 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014319 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014320 }
14321 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014322 return 1;
14323 }
14324
Ethan Furmanb95b5612015-01-23 20:05:18 -080014325 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014326 Py_DECREF(iobj);
14327 if (res == NULL)
14328 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014329 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014330 return 0;
14331
14332wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014333 switch(type)
14334 {
14335 case 'o':
14336 case 'x':
14337 case 'X':
14338 PyErr_Format(PyExc_TypeError,
14339 "%%%c format: an integer is required, "
14340 "not %.200s",
14341 type, Py_TYPE(v)->tp_name);
14342 break;
14343 default:
14344 PyErr_Format(PyExc_TypeError,
14345 "%%%c format: a number is required, "
14346 "not %.200s",
14347 type, Py_TYPE(v)->tp_name);
14348 break;
14349 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014350 return -1;
14351}
14352
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014353static Py_UCS4
14354formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014355{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014356 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014357 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014358 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014359 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014360 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014361 goto onError;
14362 }
14363 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014364 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014365 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014366 /* make sure number is a type of integer */
14367 if (!PyLong_Check(v)) {
14368 iobj = PyNumber_Index(v);
14369 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014370 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014371 }
14372 v = iobj;
14373 Py_DECREF(iobj);
14374 }
14375 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014376 x = PyLong_AsLong(v);
14377 if (x == -1 && PyErr_Occurred())
14378 goto onError;
14379
Victor Stinner8faf8212011-12-08 22:14:11 +010014380 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014381 PyErr_SetString(PyExc_OverflowError,
14382 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014383 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014384 }
14385
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014386 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014387 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014388
Benjamin Peterson29060642009-01-31 22:14:21 +000014389 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014390 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014391 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014392 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014393}
14394
Victor Stinnera47082312012-10-04 02:19:54 +020014395/* Parse options of an argument: flags, width, precision.
14396 Handle also "%(name)" syntax.
14397
14398 Return 0 if the argument has been formatted into arg->str.
14399 Return 1 if the argument has been written into ctx->writer,
14400 Raise an exception and return -1 on error. */
14401static int
14402unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14403 struct unicode_format_arg_t *arg)
14404{
14405#define FORMAT_READ(ctx) \
14406 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14407
14408 PyObject *v;
14409
Victor Stinnera47082312012-10-04 02:19:54 +020014410 if (arg->ch == '(') {
14411 /* Get argument value from a dictionary. Example: "%(name)s". */
14412 Py_ssize_t keystart;
14413 Py_ssize_t keylen;
14414 PyObject *key;
14415 int pcount = 1;
14416
14417 if (ctx->dict == NULL) {
14418 PyErr_SetString(PyExc_TypeError,
14419 "format requires a mapping");
14420 return -1;
14421 }
14422 ++ctx->fmtpos;
14423 --ctx->fmtcnt;
14424 keystart = ctx->fmtpos;
14425 /* Skip over balanced parentheses */
14426 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14427 arg->ch = FORMAT_READ(ctx);
14428 if (arg->ch == ')')
14429 --pcount;
14430 else if (arg->ch == '(')
14431 ++pcount;
14432 ctx->fmtpos++;
14433 }
14434 keylen = ctx->fmtpos - keystart - 1;
14435 if (ctx->fmtcnt < 0 || pcount > 0) {
14436 PyErr_SetString(PyExc_ValueError,
14437 "incomplete format key");
14438 return -1;
14439 }
14440 key = PyUnicode_Substring(ctx->fmtstr,
14441 keystart, keystart + keylen);
14442 if (key == NULL)
14443 return -1;
14444 if (ctx->args_owned) {
14445 Py_DECREF(ctx->args);
14446 ctx->args_owned = 0;
14447 }
14448 ctx->args = PyObject_GetItem(ctx->dict, key);
14449 Py_DECREF(key);
14450 if (ctx->args == NULL)
14451 return -1;
14452 ctx->args_owned = 1;
14453 ctx->arglen = -1;
14454 ctx->argidx = -2;
14455 }
14456
14457 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014458 while (--ctx->fmtcnt >= 0) {
14459 arg->ch = FORMAT_READ(ctx);
14460 ctx->fmtpos++;
14461 switch (arg->ch) {
14462 case '-': arg->flags |= F_LJUST; continue;
14463 case '+': arg->flags |= F_SIGN; continue;
14464 case ' ': arg->flags |= F_BLANK; continue;
14465 case '#': arg->flags |= F_ALT; continue;
14466 case '0': arg->flags |= F_ZERO; continue;
14467 }
14468 break;
14469 }
14470
14471 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014472 if (arg->ch == '*') {
14473 v = unicode_format_getnextarg(ctx);
14474 if (v == NULL)
14475 return -1;
14476 if (!PyLong_Check(v)) {
14477 PyErr_SetString(PyExc_TypeError,
14478 "* wants int");
14479 return -1;
14480 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014481 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014482 if (arg->width == -1 && PyErr_Occurred())
14483 return -1;
14484 if (arg->width < 0) {
14485 arg->flags |= F_LJUST;
14486 arg->width = -arg->width;
14487 }
14488 if (--ctx->fmtcnt >= 0) {
14489 arg->ch = FORMAT_READ(ctx);
14490 ctx->fmtpos++;
14491 }
14492 }
14493 else if (arg->ch >= '0' && arg->ch <= '9') {
14494 arg->width = arg->ch - '0';
14495 while (--ctx->fmtcnt >= 0) {
14496 arg->ch = FORMAT_READ(ctx);
14497 ctx->fmtpos++;
14498 if (arg->ch < '0' || arg->ch > '9')
14499 break;
14500 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14501 mixing signed and unsigned comparison. Since arg->ch is between
14502 '0' and '9', casting to int is safe. */
14503 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14504 PyErr_SetString(PyExc_ValueError,
14505 "width too big");
14506 return -1;
14507 }
14508 arg->width = arg->width*10 + (arg->ch - '0');
14509 }
14510 }
14511
14512 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014513 if (arg->ch == '.') {
14514 arg->prec = 0;
14515 if (--ctx->fmtcnt >= 0) {
14516 arg->ch = FORMAT_READ(ctx);
14517 ctx->fmtpos++;
14518 }
14519 if (arg->ch == '*') {
14520 v = unicode_format_getnextarg(ctx);
14521 if (v == NULL)
14522 return -1;
14523 if (!PyLong_Check(v)) {
14524 PyErr_SetString(PyExc_TypeError,
14525 "* wants int");
14526 return -1;
14527 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014528 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014529 if (arg->prec == -1 && PyErr_Occurred())
14530 return -1;
14531 if (arg->prec < 0)
14532 arg->prec = 0;
14533 if (--ctx->fmtcnt >= 0) {
14534 arg->ch = FORMAT_READ(ctx);
14535 ctx->fmtpos++;
14536 }
14537 }
14538 else if (arg->ch >= '0' && arg->ch <= '9') {
14539 arg->prec = arg->ch - '0';
14540 while (--ctx->fmtcnt >= 0) {
14541 arg->ch = FORMAT_READ(ctx);
14542 ctx->fmtpos++;
14543 if (arg->ch < '0' || arg->ch > '9')
14544 break;
14545 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14546 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014547 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014548 return -1;
14549 }
14550 arg->prec = arg->prec*10 + (arg->ch - '0');
14551 }
14552 }
14553 }
14554
14555 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14556 if (ctx->fmtcnt >= 0) {
14557 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14558 if (--ctx->fmtcnt >= 0) {
14559 arg->ch = FORMAT_READ(ctx);
14560 ctx->fmtpos++;
14561 }
14562 }
14563 }
14564 if (ctx->fmtcnt < 0) {
14565 PyErr_SetString(PyExc_ValueError,
14566 "incomplete format");
14567 return -1;
14568 }
14569 return 0;
14570
14571#undef FORMAT_READ
14572}
14573
14574/* Format one argument. Supported conversion specifiers:
14575
14576 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014577 - "i", "d", "u": int or float
14578 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014579 - "e", "E", "f", "F", "g", "G": float
14580 - "c": int or str (1 character)
14581
Victor Stinner8dbd4212012-12-04 09:30:24 +010014582 When possible, the output is written directly into the Unicode writer
14583 (ctx->writer). A string is created when padding is required.
14584
Victor Stinnera47082312012-10-04 02:19:54 +020014585 Return 0 if the argument has been formatted into *p_str,
14586 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014587 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014588static int
14589unicode_format_arg_format(struct unicode_formatter_t *ctx,
14590 struct unicode_format_arg_t *arg,
14591 PyObject **p_str)
14592{
14593 PyObject *v;
14594 _PyUnicodeWriter *writer = &ctx->writer;
14595
14596 if (ctx->fmtcnt == 0)
14597 ctx->writer.overallocate = 0;
14598
14599 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014600 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014601 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014602 return 1;
14603 }
14604
14605 v = unicode_format_getnextarg(ctx);
14606 if (v == NULL)
14607 return -1;
14608
Victor Stinnera47082312012-10-04 02:19:54 +020014609
14610 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014611 case 's':
14612 case 'r':
14613 case 'a':
14614 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14615 /* Fast path */
14616 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14617 return -1;
14618 return 1;
14619 }
14620
14621 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14622 *p_str = v;
14623 Py_INCREF(*p_str);
14624 }
14625 else {
14626 if (arg->ch == 's')
14627 *p_str = PyObject_Str(v);
14628 else if (arg->ch == 'r')
14629 *p_str = PyObject_Repr(v);
14630 else
14631 *p_str = PyObject_ASCII(v);
14632 }
14633 break;
14634
14635 case 'i':
14636 case 'd':
14637 case 'u':
14638 case 'o':
14639 case 'x':
14640 case 'X':
14641 {
14642 int ret = mainformatlong(v, arg, p_str, writer);
14643 if (ret != 0)
14644 return ret;
14645 arg->sign = 1;
14646 break;
14647 }
14648
14649 case 'e':
14650 case 'E':
14651 case 'f':
14652 case 'F':
14653 case 'g':
14654 case 'G':
14655 if (arg->width == -1 && arg->prec == -1
14656 && !(arg->flags & (F_SIGN | F_BLANK)))
14657 {
14658 /* Fast path */
14659 if (formatfloat(v, arg, NULL, writer) == -1)
14660 return -1;
14661 return 1;
14662 }
14663
14664 arg->sign = 1;
14665 if (formatfloat(v, arg, p_str, NULL) == -1)
14666 return -1;
14667 break;
14668
14669 case 'c':
14670 {
14671 Py_UCS4 ch = formatchar(v);
14672 if (ch == (Py_UCS4) -1)
14673 return -1;
14674 if (arg->width == -1 && arg->prec == -1) {
14675 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014676 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014677 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014678 return 1;
14679 }
14680 *p_str = PyUnicode_FromOrdinal(ch);
14681 break;
14682 }
14683
14684 default:
14685 PyErr_Format(PyExc_ValueError,
14686 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014687 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014688 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14689 (int)arg->ch,
14690 ctx->fmtpos - 1);
14691 return -1;
14692 }
14693 if (*p_str == NULL)
14694 return -1;
14695 assert (PyUnicode_Check(*p_str));
14696 return 0;
14697}
14698
14699static int
14700unicode_format_arg_output(struct unicode_formatter_t *ctx,
14701 struct unicode_format_arg_t *arg,
14702 PyObject *str)
14703{
14704 Py_ssize_t len;
14705 enum PyUnicode_Kind kind;
14706 void *pbuf;
14707 Py_ssize_t pindex;
14708 Py_UCS4 signchar;
14709 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014710 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014711 Py_ssize_t sublen;
14712 _PyUnicodeWriter *writer = &ctx->writer;
14713 Py_UCS4 fill;
14714
14715 fill = ' ';
14716 if (arg->sign && arg->flags & F_ZERO)
14717 fill = '0';
14718
14719 if (PyUnicode_READY(str) == -1)
14720 return -1;
14721
14722 len = PyUnicode_GET_LENGTH(str);
14723 if ((arg->width == -1 || arg->width <= len)
14724 && (arg->prec == -1 || arg->prec >= len)
14725 && !(arg->flags & (F_SIGN | F_BLANK)))
14726 {
14727 /* Fast path */
14728 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14729 return -1;
14730 return 0;
14731 }
14732
14733 /* Truncate the string for "s", "r" and "a" formats
14734 if the precision is set */
14735 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14736 if (arg->prec >= 0 && len > arg->prec)
14737 len = arg->prec;
14738 }
14739
14740 /* Adjust sign and width */
14741 kind = PyUnicode_KIND(str);
14742 pbuf = PyUnicode_DATA(str);
14743 pindex = 0;
14744 signchar = '\0';
14745 if (arg->sign) {
14746 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14747 if (ch == '-' || ch == '+') {
14748 signchar = ch;
14749 len--;
14750 pindex++;
14751 }
14752 else if (arg->flags & F_SIGN)
14753 signchar = '+';
14754 else if (arg->flags & F_BLANK)
14755 signchar = ' ';
14756 else
14757 arg->sign = 0;
14758 }
14759 if (arg->width < len)
14760 arg->width = len;
14761
14762 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014763 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014764 if (!(arg->flags & F_LJUST)) {
14765 if (arg->sign) {
14766 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014767 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014768 }
14769 else {
14770 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014771 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014772 }
14773 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014774 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14775 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014776 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014777 }
14778
Victor Stinnera47082312012-10-04 02:19:54 +020014779 buflen = arg->width;
14780 if (arg->sign && len == arg->width)
14781 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014782 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014783 return -1;
14784
14785 /* Write the sign if needed */
14786 if (arg->sign) {
14787 if (fill != ' ') {
14788 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14789 writer->pos += 1;
14790 }
14791 if (arg->width > len)
14792 arg->width--;
14793 }
14794
14795 /* Write the numeric prefix for "x", "X" and "o" formats
14796 if the alternate form is used.
14797 For example, write "0x" for the "%#x" format. */
14798 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14799 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14800 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14801 if (fill != ' ') {
14802 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14803 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14804 writer->pos += 2;
14805 pindex += 2;
14806 }
14807 arg->width -= 2;
14808 if (arg->width < 0)
14809 arg->width = 0;
14810 len -= 2;
14811 }
14812
14813 /* Pad left with the fill character if needed */
14814 if (arg->width > len && !(arg->flags & F_LJUST)) {
14815 sublen = arg->width - len;
14816 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14817 writer->pos += sublen;
14818 arg->width = len;
14819 }
14820
14821 /* If padding with spaces: write sign if needed and/or numeric prefix if
14822 the alternate form is used */
14823 if (fill == ' ') {
14824 if (arg->sign) {
14825 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14826 writer->pos += 1;
14827 }
14828 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14829 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14830 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14831 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14832 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14833 writer->pos += 2;
14834 pindex += 2;
14835 }
14836 }
14837
14838 /* Write characters */
14839 if (len) {
14840 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14841 str, pindex, len);
14842 writer->pos += len;
14843 }
14844
14845 /* Pad right with the fill character if needed */
14846 if (arg->width > len) {
14847 sublen = arg->width - len;
14848 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14849 writer->pos += sublen;
14850 }
14851 return 0;
14852}
14853
14854/* Helper of PyUnicode_Format(): format one arg.
14855 Return 0 on success, raise an exception and return -1 on error. */
14856static int
14857unicode_format_arg(struct unicode_formatter_t *ctx)
14858{
14859 struct unicode_format_arg_t arg;
14860 PyObject *str;
14861 int ret;
14862
Victor Stinner8dbd4212012-12-04 09:30:24 +010014863 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14864 arg.flags = 0;
14865 arg.width = -1;
14866 arg.prec = -1;
14867 arg.sign = 0;
14868 str = NULL;
14869
Victor Stinnera47082312012-10-04 02:19:54 +020014870 ret = unicode_format_arg_parse(ctx, &arg);
14871 if (ret == -1)
14872 return -1;
14873
14874 ret = unicode_format_arg_format(ctx, &arg, &str);
14875 if (ret == -1)
14876 return -1;
14877
14878 if (ret != 1) {
14879 ret = unicode_format_arg_output(ctx, &arg, str);
14880 Py_DECREF(str);
14881 if (ret == -1)
14882 return -1;
14883 }
14884
14885 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14886 PyErr_SetString(PyExc_TypeError,
14887 "not all arguments converted during string formatting");
14888 return -1;
14889 }
14890 return 0;
14891}
14892
Alexander Belopolsky40018472011-02-26 01:02:56 +000014893PyObject *
14894PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014895{
Victor Stinnera47082312012-10-04 02:19:54 +020014896 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014897
Guido van Rossumd57fd912000-03-10 22:53:23 +000014898 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014899 PyErr_BadInternalCall();
14900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014901 }
Victor Stinnera47082312012-10-04 02:19:54 +020014902
14903 ctx.fmtstr = PyUnicode_FromObject(format);
14904 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014905 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014906 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14907 Py_DECREF(ctx.fmtstr);
14908 return NULL;
14909 }
14910 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14911 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14912 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14913 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014914
Victor Stinner8f674cc2013-04-17 23:02:17 +020014915 _PyUnicodeWriter_Init(&ctx.writer);
14916 ctx.writer.min_length = ctx.fmtcnt + 100;
14917 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014918
Guido van Rossumd57fd912000-03-10 22:53:23 +000014919 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014920 ctx.arglen = PyTuple_Size(args);
14921 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014922 }
14923 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014924 ctx.arglen = -1;
14925 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014926 }
Victor Stinnera47082312012-10-04 02:19:54 +020014927 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014928 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014929 ctx.dict = args;
14930 else
14931 ctx.dict = NULL;
14932 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014933
Victor Stinnera47082312012-10-04 02:19:54 +020014934 while (--ctx.fmtcnt >= 0) {
14935 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014936 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014937
14938 nonfmtpos = ctx.fmtpos++;
14939 while (ctx.fmtcnt >= 0 &&
14940 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14941 ctx.fmtpos++;
14942 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014943 }
Victor Stinnera47082312012-10-04 02:19:54 +020014944 if (ctx.fmtcnt < 0) {
14945 ctx.fmtpos--;
14946 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014947 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014948
Victor Stinnercfc4c132013-04-03 01:48:39 +020014949 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14950 nonfmtpos, ctx.fmtpos) < 0)
14951 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014952 }
14953 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014954 ctx.fmtpos++;
14955 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014956 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014957 }
14958 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014959
Victor Stinnera47082312012-10-04 02:19:54 +020014960 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014961 PyErr_SetString(PyExc_TypeError,
14962 "not all arguments converted during string formatting");
14963 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014964 }
14965
Victor Stinnera47082312012-10-04 02:19:54 +020014966 if (ctx.args_owned) {
14967 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014968 }
Victor Stinnera47082312012-10-04 02:19:54 +020014969 Py_DECREF(ctx.fmtstr);
14970 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014971
Benjamin Peterson29060642009-01-31 22:14:21 +000014972 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014973 Py_DECREF(ctx.fmtstr);
14974 _PyUnicodeWriter_Dealloc(&ctx.writer);
14975 if (ctx.args_owned) {
14976 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014977 }
14978 return NULL;
14979}
14980
Jeremy Hylton938ace62002-07-17 16:30:39 +000014981static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014982unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14983
Tim Peters6d6c1a32001-08-02 04:15:00 +000014984static PyObject *
14985unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14986{
Benjamin Peterson29060642009-01-31 22:14:21 +000014987 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014988 static char *kwlist[] = {"object", "encoding", "errors", 0};
14989 char *encoding = NULL;
14990 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014991
Benjamin Peterson14339b62009-01-31 16:36:08 +000014992 if (type != &PyUnicode_Type)
14993 return unicode_subtype_new(type, args, kwds);
14994 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014995 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014996 return NULL;
14997 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014998 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014999 if (encoding == NULL && errors == NULL)
15000 return PyObject_Str(x);
15001 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015002 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015003}
15004
Guido van Rossume023fe02001-08-30 03:12:59 +000015005static PyObject *
15006unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15007{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015008 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015009 Py_ssize_t length, char_size;
15010 int share_wstr, share_utf8;
15011 unsigned int kind;
15012 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015013
Benjamin Peterson14339b62009-01-31 16:36:08 +000015014 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015015
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015016 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015017 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015018 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015019 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015020 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015021 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015022 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015023 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015024
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015025 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015026 if (self == NULL) {
15027 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015028 return NULL;
15029 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015030 kind = PyUnicode_KIND(unicode);
15031 length = PyUnicode_GET_LENGTH(unicode);
15032
15033 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015034#ifdef Py_DEBUG
15035 _PyUnicode_HASH(self) = -1;
15036#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015037 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015038#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015039 _PyUnicode_STATE(self).interned = 0;
15040 _PyUnicode_STATE(self).kind = kind;
15041 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015042 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015043 _PyUnicode_STATE(self).ready = 1;
15044 _PyUnicode_WSTR(self) = NULL;
15045 _PyUnicode_UTF8_LENGTH(self) = 0;
15046 _PyUnicode_UTF8(self) = NULL;
15047 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015048 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015049
15050 share_utf8 = 0;
15051 share_wstr = 0;
15052 if (kind == PyUnicode_1BYTE_KIND) {
15053 char_size = 1;
15054 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15055 share_utf8 = 1;
15056 }
15057 else if (kind == PyUnicode_2BYTE_KIND) {
15058 char_size = 2;
15059 if (sizeof(wchar_t) == 2)
15060 share_wstr = 1;
15061 }
15062 else {
15063 assert(kind == PyUnicode_4BYTE_KIND);
15064 char_size = 4;
15065 if (sizeof(wchar_t) == 4)
15066 share_wstr = 1;
15067 }
15068
15069 /* Ensure we won't overflow the length. */
15070 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15071 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015072 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015073 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015074 data = PyObject_MALLOC((length + 1) * char_size);
15075 if (data == NULL) {
15076 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015077 goto onError;
15078 }
15079
Victor Stinnerc3c74152011-10-02 20:39:55 +020015080 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015081 if (share_utf8) {
15082 _PyUnicode_UTF8_LENGTH(self) = length;
15083 _PyUnicode_UTF8(self) = data;
15084 }
15085 if (share_wstr) {
15086 _PyUnicode_WSTR_LENGTH(self) = length;
15087 _PyUnicode_WSTR(self) = (wchar_t *)data;
15088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015089
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015090 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015091 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015092 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015093#ifdef Py_DEBUG
15094 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15095#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015096 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015097 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015098
15099onError:
15100 Py_DECREF(unicode);
15101 Py_DECREF(self);
15102 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015103}
15104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015105PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015106"str(object='') -> str\n\
15107str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015108\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015109Create a new string object from the given object. If encoding or\n\
15110errors is specified, then the object must expose a data buffer\n\
15111that will be decoded using the given encoding and error handler.\n\
15112Otherwise, returns the result of object.__str__() (if defined)\n\
15113or repr(object).\n\
15114encoding defaults to sys.getdefaultencoding().\n\
15115errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015116
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015117static PyObject *unicode_iter(PyObject *seq);
15118
Guido van Rossumd57fd912000-03-10 22:53:23 +000015119PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015120 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015121 "str", /* tp_name */
15122 sizeof(PyUnicodeObject), /* tp_size */
15123 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015124 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015125 (destructor)unicode_dealloc, /* tp_dealloc */
15126 0, /* tp_print */
15127 0, /* tp_getattr */
15128 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015129 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015130 unicode_repr, /* tp_repr */
15131 &unicode_as_number, /* tp_as_number */
15132 &unicode_as_sequence, /* tp_as_sequence */
15133 &unicode_as_mapping, /* tp_as_mapping */
15134 (hashfunc) unicode_hash, /* tp_hash*/
15135 0, /* tp_call*/
15136 (reprfunc) unicode_str, /* tp_str */
15137 PyObject_GenericGetAttr, /* tp_getattro */
15138 0, /* tp_setattro */
15139 0, /* tp_as_buffer */
15140 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015141 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015142 unicode_doc, /* tp_doc */
15143 0, /* tp_traverse */
15144 0, /* tp_clear */
15145 PyUnicode_RichCompare, /* tp_richcompare */
15146 0, /* tp_weaklistoffset */
15147 unicode_iter, /* tp_iter */
15148 0, /* tp_iternext */
15149 unicode_methods, /* tp_methods */
15150 0, /* tp_members */
15151 0, /* tp_getset */
15152 &PyBaseObject_Type, /* tp_base */
15153 0, /* tp_dict */
15154 0, /* tp_descr_get */
15155 0, /* tp_descr_set */
15156 0, /* tp_dictoffset */
15157 0, /* tp_init */
15158 0, /* tp_alloc */
15159 unicode_new, /* tp_new */
15160 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015161};
15162
15163/* Initialize the Unicode implementation */
15164
Victor Stinner3a50e702011-10-18 21:21:00 +020015165int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015166{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015167 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015168 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015169 0x000A, /* LINE FEED */
15170 0x000D, /* CARRIAGE RETURN */
15171 0x001C, /* FILE SEPARATOR */
15172 0x001D, /* GROUP SEPARATOR */
15173 0x001E, /* RECORD SEPARATOR */
15174 0x0085, /* NEXT LINE */
15175 0x2028, /* LINE SEPARATOR */
15176 0x2029, /* PARAGRAPH SEPARATOR */
15177 };
15178
Fred Drakee4315f52000-05-09 19:53:39 +000015179 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015180 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015181 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015182 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015183 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015184
Guido van Rossumcacfc072002-05-24 19:01:59 +000015185 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015186 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015187
15188 /* initialize the linebreak bloom filter */
15189 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015190 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015191 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015192
Christian Heimes26532f72013-07-20 14:57:16 +020015193 if (PyType_Ready(&EncodingMapType) < 0)
15194 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015195
Benjamin Petersonc4311282012-10-30 23:21:10 -040015196 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15197 Py_FatalError("Can't initialize field name iterator type");
15198
15199 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15200 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015201
Victor Stinner3a50e702011-10-18 21:21:00 +020015202 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015203}
15204
15205/* Finalize the Unicode implementation */
15206
/* There is nothing to release here: the function always reports that
   zero items were freed. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15212
Guido van Rossumd57fd912000-03-10 22:53:23 +000015213void
Thomas Wouters78890102000-07-22 19:25:51 +000015214_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015215{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015216 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015217
Serhiy Storchaka05997252013-01-26 12:14:02 +020015218 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015219
Serhiy Storchaka05997252013-01-26 12:14:02 +020015220 for (i = 0; i < 256; i++)
15221 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015222 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015223 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015224}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015225
Walter Dörwald16807132007-05-25 13:52:07 +000015226void
15227PyUnicode_InternInPlace(PyObject **p)
15228{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015229 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015230 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015231#ifdef Py_DEBUG
15232 assert(s != NULL);
15233 assert(_PyUnicode_CHECK(s));
15234#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015235 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015236 return;
15237#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015238 /* If it's a subclass, we don't really know what putting
15239 it in the interned dict might do. */
15240 if (!PyUnicode_CheckExact(s))
15241 return;
15242 if (PyUnicode_CHECK_INTERNED(s))
15243 return;
15244 if (interned == NULL) {
15245 interned = PyDict_New();
15246 if (interned == NULL) {
15247 PyErr_Clear(); /* Don't leave an exception */
15248 return;
15249 }
15250 }
15251 /* It might be that the GetItem call fails even
15252 though the key is present in the dictionary,
15253 namely when this happens during a stack overflow. */
15254 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015255 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015257
Victor Stinnerf0335102013-04-14 19:13:03 +020015258 if (t) {
15259 Py_INCREF(t);
15260 Py_DECREF(*p);
15261 *p = t;
15262 return;
15263 }
Walter Dörwald16807132007-05-25 13:52:07 +000015264
Benjamin Peterson14339b62009-01-31 16:36:08 +000015265 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015266 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015267 PyErr_Clear();
15268 PyThreadState_GET()->recursion_critical = 0;
15269 return;
15270 }
15271 PyThreadState_GET()->recursion_critical = 0;
15272 /* The two references in interned are not counted by refcnt.
15273 The deallocator will take care of this */
15274 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015275 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015276}
15277
15278void
15279PyUnicode_InternImmortal(PyObject **p)
15280{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015281 PyUnicode_InternInPlace(p);
15282 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015283 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015284 Py_INCREF(*p);
15285 }
Walter Dörwald16807132007-05-25 13:52:07 +000015286}
15287
15288PyObject *
15289PyUnicode_InternFromString(const char *cp)
15290{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015291 PyObject *s = PyUnicode_FromString(cp);
15292 if (s == NULL)
15293 return NULL;
15294 PyUnicode_InternInPlace(&s);
15295 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015296}
15297
Alexander Belopolsky40018472011-02-26 01:02:56 +000015298void
15299_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015300{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015301 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015302 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015303 Py_ssize_t i, n;
15304 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015305
Benjamin Peterson14339b62009-01-31 16:36:08 +000015306 if (interned == NULL || !PyDict_Check(interned))
15307 return;
15308 keys = PyDict_Keys(interned);
15309 if (keys == NULL || !PyList_Check(keys)) {
15310 PyErr_Clear();
15311 return;
15312 }
Walter Dörwald16807132007-05-25 13:52:07 +000015313
Benjamin Peterson14339b62009-01-31 16:36:08 +000015314 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15315 detector, interned unicode strings are not forcibly deallocated;
15316 rather, we give them their stolen references back, and then clear
15317 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015318
Benjamin Peterson14339b62009-01-31 16:36:08 +000015319 n = PyList_GET_SIZE(keys);
15320 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015321 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015322 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015323 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015324 if (PyUnicode_READY(s) == -1) {
15325 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015326 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015327 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015328 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015329 case SSTATE_NOT_INTERNED:
15330 /* XXX Shouldn't happen */
15331 break;
15332 case SSTATE_INTERNED_IMMORTAL:
15333 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015334 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015335 break;
15336 case SSTATE_INTERNED_MORTAL:
15337 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015338 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015339 break;
15340 default:
15341 Py_FatalError("Inconsistent interned string state.");
15342 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015343 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015344 }
15345 fprintf(stderr, "total size of all interned strings: "
15346 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15347 "mortal/immortal\n", mortal_size, immortal_size);
15348 Py_DECREF(keys);
15349 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015350 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015351}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015352
15353
15354/********************* Unicode Iterator **************************/
15355
15356typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015357 PyObject_HEAD
15358 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015359 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015360} unicodeiterobject;
15361
15362static void
15363unicodeiter_dealloc(unicodeiterobject *it)
15364{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015365 _PyObject_GC_UNTRACK(it);
15366 Py_XDECREF(it->it_seq);
15367 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015368}
15369
15370static int
15371unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15372{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015373 Py_VISIT(it->it_seq);
15374 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015375}
15376
15377static PyObject *
15378unicodeiter_next(unicodeiterobject *it)
15379{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015380 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015381
Benjamin Peterson14339b62009-01-31 16:36:08 +000015382 assert(it != NULL);
15383 seq = it->it_seq;
15384 if (seq == NULL)
15385 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015386 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015388 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15389 int kind = PyUnicode_KIND(seq);
15390 void *data = PyUnicode_DATA(seq);
15391 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15392 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015393 if (item != NULL)
15394 ++it->it_index;
15395 return item;
15396 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015397
Benjamin Peterson14339b62009-01-31 16:36:08 +000015398 Py_DECREF(seq);
15399 it->it_seq = NULL;
15400 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015401}
15402
15403static PyObject *
15404unicodeiter_len(unicodeiterobject *it)
15405{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015406 Py_ssize_t len = 0;
15407 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015408 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015409 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015410}
15411
15412PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15413
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015414static PyObject *
15415unicodeiter_reduce(unicodeiterobject *it)
15416{
15417 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015418 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015419 it->it_seq, it->it_index);
15420 } else {
15421 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15422 if (u == NULL)
15423 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015424 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015425 }
15426}
15427
15428PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15429
15430static PyObject *
15431unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15432{
15433 Py_ssize_t index = PyLong_AsSsize_t(state);
15434 if (index == -1 && PyErr_Occurred())
15435 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015436 if (it->it_seq != NULL) {
15437 if (index < 0)
15438 index = 0;
15439 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15440 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15441 it->it_index = index;
15442 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015443 Py_RETURN_NONE;
15444}
15445
15446PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15447
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015448static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015449 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015450 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015451 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15452 reduce_doc},
15453 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15454 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015455 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015456};
15457
15458PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015459 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15460 "str_iterator", /* tp_name */
15461 sizeof(unicodeiterobject), /* tp_basicsize */
15462 0, /* tp_itemsize */
15463 /* methods */
15464 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15465 0, /* tp_print */
15466 0, /* tp_getattr */
15467 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015468 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015469 0, /* tp_repr */
15470 0, /* tp_as_number */
15471 0, /* tp_as_sequence */
15472 0, /* tp_as_mapping */
15473 0, /* tp_hash */
15474 0, /* tp_call */
15475 0, /* tp_str */
15476 PyObject_GenericGetAttr, /* tp_getattro */
15477 0, /* tp_setattro */
15478 0, /* tp_as_buffer */
15479 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15480 0, /* tp_doc */
15481 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15482 0, /* tp_clear */
15483 0, /* tp_richcompare */
15484 0, /* tp_weaklistoffset */
15485 PyObject_SelfIter, /* tp_iter */
15486 (iternextfunc)unicodeiter_next, /* tp_iternext */
15487 unicodeiter_methods, /* tp_methods */
15488 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015489};
15490
15491static PyObject *
15492unicode_iter(PyObject *seq)
15493{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015494 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015495
Benjamin Peterson14339b62009-01-31 16:36:08 +000015496 if (!PyUnicode_Check(seq)) {
15497 PyErr_BadInternalCall();
15498 return NULL;
15499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015500 if (PyUnicode_READY(seq) == -1)
15501 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015502 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15503 if (it == NULL)
15504 return NULL;
15505 it->it_index = 0;
15506 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015507 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015508 _PyObject_GC_TRACK(it);
15509 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015510}
15511
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015512
15513size_t
15514Py_UNICODE_strlen(const Py_UNICODE *u)
15515{
15516 int res = 0;
15517 while(*u++)
15518 res++;
15519 return res;
15520}
15521
15522Py_UNICODE*
15523Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15524{
15525 Py_UNICODE *u = s1;
15526 while ((*u++ = *s2++));
15527 return s1;
15528}
15529
15530Py_UNICODE*
15531Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15532{
15533 Py_UNICODE *u = s1;
15534 while ((*u++ = *s2++))
15535 if (n-- == 0)
15536 break;
15537 return s1;
15538}
15539
15540Py_UNICODE*
15541Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15542{
15543 Py_UNICODE *u1 = s1;
15544 u1 += Py_UNICODE_strlen(u1);
15545 Py_UNICODE_strcpy(u1, s2);
15546 return s1;
15547}
15548
15549int
15550Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15551{
15552 while (*s1 && *s2 && *s1 == *s2)
15553 s1++, s2++;
15554 if (*s1 && *s2)
15555 return (*s1 < *s2) ? -1 : +1;
15556 if (*s1)
15557 return 1;
15558 if (*s2)
15559 return -1;
15560 return 0;
15561}
15562
15563int
15564Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15565{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015566 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015567 for (; n != 0; n--) {
15568 u1 = *s1;
15569 u2 = *s2;
15570 if (u1 != u2)
15571 return (u1 < u2) ? -1 : +1;
15572 if (u1 == '\0')
15573 return 0;
15574 s1++;
15575 s2++;
15576 }
15577 return 0;
15578}
15579
15580Py_UNICODE*
15581Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15582{
15583 const Py_UNICODE *p;
15584 for (p = s; *p; p++)
15585 if (*p == c)
15586 return (Py_UNICODE*)p;
15587 return NULL;
15588}
15589
15590Py_UNICODE*
15591Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15592{
15593 const Py_UNICODE *p;
15594 p = s + Py_UNICODE_strlen(s);
15595 while (p != s) {
15596 p--;
15597 if (*p == c)
15598 return (Py_UNICODE*)p;
15599 }
15600 return NULL;
15601}
Victor Stinner331ea922010-08-10 16:37:20 +000015602
Victor Stinner71133ff2010-09-01 23:43:53 +000015603Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015604PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015605{
Victor Stinner577db2c2011-10-11 22:12:48 +020015606 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015607 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015609 if (!PyUnicode_Check(unicode)) {
15610 PyErr_BadArgument();
15611 return NULL;
15612 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015613 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015614 if (u == NULL)
15615 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015616 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015617 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015618 PyErr_NoMemory();
15619 return NULL;
15620 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015621 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015622 size *= sizeof(Py_UNICODE);
15623 copy = PyMem_Malloc(size);
15624 if (copy == NULL) {
15625 PyErr_NoMemory();
15626 return NULL;
15627 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015628 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015629 return copy;
15630}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015631
Georg Brandl66c221e2010-10-14 07:04:07 +000015632/* A _string module, to export formatter_parser and formatter_field_name_split
15633 to the string.Formatter class implemented in Python. */
15634
15635static PyMethodDef _string_methods[] = {
15636 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15637 METH_O, PyDoc_STR("split the argument as a field name")},
15638 {"formatter_parser", (PyCFunction) formatter_parser,
15639 METH_O, PyDoc_STR("parse the argument as a format string")},
15640 {NULL, NULL}
15641};
15642
15643static struct PyModuleDef _string_module = {
15644 PyModuleDef_HEAD_INIT,
15645 "_string",
15646 PyDoc_STR("string helper module"),
15647 0,
15648 _string_methods,
15649 NULL,
15650 NULL,
15651 NULL,
15652 NULL
15653};
15654
15655PyMODINIT_FUNC
15656PyInit__string(void)
15657{
15658 return PyModule_Create(&_string_module);
15659}
15660
15661
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015662#ifdef __cplusplus
15663}
15664#endif