blob: a3bbf9261f9565687a99bc18ed520cecc20693e1 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000056/* --- Globals ------------------------------------------------------------
57
Serhiy Storchaka05997252013-01-26 12:14:02 +020058NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Type check used by the assert()s in the macros below.
   With Py_DEBUG it runs _PyUnicode_CheckConsistency() with the content
   check disabled (second argument 0); otherwise it is a plain
   PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Field accessors for the string object structures.

   Macros that are a bare cast plus member access perform no checking.
   Macros that start with assert() validate the object first; because
   they are comma expressions they yield a value, not an lvalue. */

/* Raw `utf8` field of the object viewed as a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer: for a compact ASCII string this is the memory
   immediately following the PyASCIIObject header, otherwise the `utf8`
   field. The object must be ready. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw `utf8_length` field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: for a compact ASCII string this is the `length`
   field, otherwise the `utf8_length` field. The object must be ready. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw `wstr` field (PyASCIIObject). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Raw `wstr_length` field (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw `length` field. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* Raw `state` bit-field structure. */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Raw `hash` field. */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked read of `state.kind`. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* Checked read of `length`. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw `data.any` pointer of the object viewed as a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* Replace the PyUnicode_READY definition that is already in effect with a
   file-local one that asserts via _PyUnicode_CHECK.
   Evaluates to 0 when the object is already ready, otherwise to the
   result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer of `op` is the same pointer as its character
   data, i.e. the UTF-8 representation shares the data buffer.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same pointer as its character
   data, i.e. the wchar_t representation shares the data buffer.
   The comparison uses the macro argument itself; it previously read a
   variable named `unicode` from the caller's scope, which only worked
   when the caller happened to pass a variable of exactly that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its
   utf8 pointer is non-NULL, and that pointer differs from the character
   data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either
   the string is not ready yet or wstr differs from the character data
   pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer the converted characters are written to.

   Each character is converted with a plain cast; no range check is
   performed.  begin, end and to are each evaluated exactly once.
   The main loop handles four characters per iteration (the count is
   rounded down to a multiple of 4); the trailing loop copies the
   remaining 0-3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Divisor controlling how much extra space is requested when buffers are
   overallocated.
   NOTE(review): the percentages below imply the extra amount is
   size / OVERALLOCATE_FACTOR -- confirm at the use sites. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor
      (this branch is used on every non-Windows platform) */
#  define OVERALLOCATE_FACTOR 4
#endif
173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)

   The pointer starts out NULL.
*/
static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
/* The empty Unicode object is shared to improve performance.
   It is created lazily by _Py_INCREF_UNICODE_EMPTY(); until then the
   pointer is NULL. */
static PyObject *unicode_empty = NULL;

/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  On creation the reference returned
   by PyUnicode_New() stays in the global and one extra reference is
   taken for the caller.  If creation fails, unicode_empty remains NULL
   and no reference is taken, so callers must check it afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function; returns NULL if the empty string could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
/* Forward declaration */
Py_LOCAL_INLINE(int)
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings (head pointer, initially NULL). */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255); every
   slot starts out NULL and is filled in on demand. */
static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by code point 0x00..0x7F: the entry is 1 when the
   character is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x09 CHARACTER TABULATION
     *   0x0A LINE FEED
     *   0x0B LINE TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x20 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
247
/* forward */
/* File-local helpers declared ahead of their first use. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors taking a buffer of fixed-width code units and its size. */
static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoding error-handler hooks. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Same for linebreaks: lookup table indexed by code point 0x00..0x7F,
   1 when the character is a line break and 0 otherwise.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Identifiers for the codec error handlers that are recognised by name. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler identifier.

   errors: handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching identifier
   for the other recognised names (the comparison is exact and
   case-sensitive), and _Py_ERROR_OTHER for any other name. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL)
        return _Py_ERROR_STRICT;

    for (idx = 0; idx < sizeof(known) / sizeof(known[0]); idx++) {
        if (strcmp(errors, known[idx].name) == 0)
            return known[idx].handler;
    }
    return _Py_ERROR_OTHER;
}
335
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300336/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
337 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000338Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000339PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000340{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000341#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000342 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000343#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000344 /* This is actually an illegal character, so it should
345 not be passed to unichr. */
346 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347#endif
348}
349
#ifdef Py_DEBUG
/* Debug-build invariant checker for a Unicode object.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not in the wchar_t
                  kind), also scan the characters to verify that the
                  narrowest possible kind is used and that the data is
                  NUL-terminated.

   Every check is an assert(), so a violation aborts; when all checks
   pass the function returns 1 (which lets callers wrap the call itself
   in assert()). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: always 1 byte per character and ready. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact, non-ASCII: the characters follow the
               PyCompactUnicodeObject header and the UTF-8 buffer is
               never shared with them. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            /* Non-compact: the characters live behind data.any. */
            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer exists. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* Ready, non-compact string. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* An ASCII string shares its data as the UTF-8 buffer;
                   a non-ASCII one must not. */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data buffer exactly when the kind has the
               same width as wchar_t. */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing buffer must have a zero length recorded. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* Find the largest code point in the string. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest code point must fall in the range that requires
           exactly this kind (and, for 1-byte strings, match the ascii
           flag). */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The character after the last one must be a NUL terminator. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
465
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466static PyObject*
467unicode_result_wchar(PyObject *unicode)
468{
469#ifndef Py_DEBUG
470 Py_ssize_t len;
471
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100472 len = _PyUnicode_WSTR_LENGTH(unicode);
473 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100474 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200475 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 }
477
478 if (len == 1) {
479 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100480 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
482 Py_DECREF(unicode);
483 return latin1_char;
484 }
485 }
486
487 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200488 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100489 return NULL;
490 }
491#else
Victor Stinneraa771272012-10-04 02:32:58 +0200492 assert(Py_REFCNT(unicode) == 1);
493
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100494 /* don't make the result ready in debug mode to ensure that the caller
495 makes the string ready before using it */
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497#endif
498 return unicode;
499}
500
501static PyObject*
502unicode_result_ready(PyObject *unicode)
503{
504 Py_ssize_t length;
505
506 length = PyUnicode_GET_LENGTH(unicode);
507 if (length == 0) {
508 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100509 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200510 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100511 }
512 return unicode_empty;
513 }
514
515 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200516 void *data = PyUnicode_DATA(unicode);
517 int kind = PyUnicode_KIND(unicode);
518 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100519 if (ch < 256) {
520 PyObject *latin1_char = unicode_latin1[ch];
521 if (latin1_char != NULL) {
522 if (unicode != latin1_char) {
523 Py_INCREF(latin1_char);
524 Py_DECREF(unicode);
525 }
526 return latin1_char;
527 }
528 else {
529 assert(_PyUnicode_CheckConsistency(unicode, 1));
530 Py_INCREF(unicode);
531 unicode_latin1[ch] = unicode;
532 return unicode;
533 }
534 }
535 }
536
537 assert(_PyUnicode_CheckConsistency(unicode, 1));
538 return unicode;
539}
540
541static PyObject*
542unicode_result(PyObject *unicode)
543{
544 assert(_PyUnicode_CHECK(unicode));
545 if (PyUnicode_IS_READY(unicode))
546 return unicode_result_ready(unicode);
547 else
548 return unicode_result_wchar(unicode);
549}
550
Victor Stinnerc4b49542011-12-11 22:44:26 +0100551static PyObject*
552unicode_result_unchanged(PyObject *unicode)
553{
554 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500555 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100556 return NULL;
557 Py_INCREF(unicode);
558 return unicode;
559 }
560 else
561 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100562 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563}
564
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200565/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
566 ASCII, Latin1, UTF-8, etc. */
567static char*
568backslashreplace(_PyBytesWriter *writer, Py_ssize_t prealloc_per_char,
569 char *str,
570 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
571{
572 Py_ssize_t size, i, prealloc;
573 Py_UCS4 ch;
574 enum PyUnicode_Kind kind;
575 void *data;
576
577 assert(PyUnicode_IS_READY(unicode));
578 kind = PyUnicode_KIND(unicode);
579 data = PyUnicode_DATA(unicode);
580
581 size = 0;
582 /* determine replacement size */
583 for (i = collstart; i < collend; ++i) {
584 Py_ssize_t incr;
585
586 ch = PyUnicode_READ(kind, data, i);
587 if (ch < 0x100)
588 incr = 2+2;
589 else if (ch < 0x10000)
590 incr = 2+4;
591 else {
592 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200593 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200594 }
595 if (size > PY_SSIZE_T_MAX - incr) {
596 PyErr_SetString(PyExc_OverflowError,
597 "encoded result is too long for a Python string");
598 return NULL;
599 }
600 size += incr;
601 }
602
603 prealloc = prealloc_per_char * (collend - collstart);
604 if (size > prealloc) {
605 str = _PyBytesWriter_Prepare(writer, str, size - prealloc);
606 if (str == NULL)
607 return NULL;
608 }
609
610 /* generate replacement */
611 for (i = collstart; i < collend; ++i) {
612 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200613 *str++ = '\\';
614 if (ch >= 0x00010000) {
615 *str++ = 'U';
616 *str++ = Py_hexdigits[(ch>>28)&0xf];
617 *str++ = Py_hexdigits[(ch>>24)&0xf];
618 *str++ = Py_hexdigits[(ch>>20)&0xf];
619 *str++ = Py_hexdigits[(ch>>16)&0xf];
620 *str++ = Py_hexdigits[(ch>>12)&0xf];
621 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200622 }
Victor Stinner797485e2015-10-09 03:17:30 +0200623 else if (ch >= 0x100) {
624 *str++ = 'u';
625 *str++ = Py_hexdigits[(ch>>12)&0xf];
626 *str++ = Py_hexdigits[(ch>>8)&0xf];
627 }
628 else
629 *str++ = 'x';
630 *str++ = Py_hexdigits[(ch>>4)&0xf];
631 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200632 }
633 return str;
634}
635
636/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
637 ASCII, Latin1, UTF-8, etc. */
638static char*
639xmlcharrefreplace(_PyBytesWriter *writer, Py_ssize_t prealloc_per_char,
640 char *str,
641 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
642{
643 Py_ssize_t size, i, prealloc;
644 Py_UCS4 ch;
645 enum PyUnicode_Kind kind;
646 void *data;
647
648 assert(PyUnicode_IS_READY(unicode));
649 kind = PyUnicode_KIND(unicode);
650 data = PyUnicode_DATA(unicode);
651
652 size = 0;
653 /* determine replacement size */
654 for (i = collstart; i < collend; ++i) {
655 Py_ssize_t incr;
656
657 ch = PyUnicode_READ(kind, data, i);
658 if (ch < 10)
659 incr = 2+1+1;
660 else if (ch < 100)
661 incr = 2+2+1;
662 else if (ch < 1000)
663 incr = 2+3+1;
664 else if (ch < 10000)
665 incr = 2+4+1;
666 else if (ch < 100000)
667 incr = 2+5+1;
668 else if (ch < 1000000)
669 incr = 2+6+1;
670 else {
671 assert(ch <= MAX_UNICODE);
672 incr = 2+7+1;
673 }
674 if (size > PY_SSIZE_T_MAX - incr) {
675 PyErr_SetString(PyExc_OverflowError,
676 "encoded result is too long for a Python string");
677 return NULL;
678 }
679 size += incr;
680 }
681
682 prealloc = prealloc_per_char * (collend - collstart);
683 if (size > prealloc) {
684 str = _PyBytesWriter_Prepare(writer, str, size - prealloc);
685 if (str == NULL)
686 return NULL;
687 }
688
689 /* generate replacement */
690 for (i = collstart; i < collend; ++i) {
691 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
692 }
693 return str;
694}
695
Thomas Wouters477c8d52006-05-27 19:21:47 +0000696/* --- Bloom Filters ----------------------------------------------------- */
697
698/* stuff to implement simple "bloom filters" for Unicode characters.
699 to keep things simple, we use a single bitmask, using the least 5
700 bits from each unicode characters as the bit index. */
701
702/* the linebreak mask is set up by Unicode_Init below */
703
Antoine Pitrouf068f942010-01-13 14:19:12 +0000704#if LONG_BIT >= 128
705#define BLOOM_WIDTH 128
706#elif LONG_BIT >= 64
707#define BLOOM_WIDTH 64
708#elif LONG_BIT >= 32
709#define BLOOM_WIDTH 32
710#else
711#error "LONG_BIT is smaller than 32"
712#endif
713
Thomas Wouters477c8d52006-05-27 19:21:47 +0000714#define BLOOM_MASK unsigned long
715
Serhiy Storchaka05997252013-01-26 12:14:02 +0200716static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000717
Antoine Pitrouf068f942010-01-13 14:19:12 +0000718#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000719
Benjamin Peterson29060642009-01-31 22:14:21 +0000720#define BLOOM_LINEBREAK(ch) \
721 ((ch) < 128U ? ascii_linebreak[(ch)] : \
722 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000723
Alexander Belopolsky40018472011-02-26 01:02:56 +0000724Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200725make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000726{
Victor Stinnera85af502013-04-09 21:53:54 +0200727#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
728 do { \
729 TYPE *data = (TYPE *)PTR; \
730 TYPE *end = data + LEN; \
731 Py_UCS4 ch; \
732 for (; data != end; data++) { \
733 ch = *data; \
734 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
735 } \
736 break; \
737 } while (0)
738
Thomas Wouters477c8d52006-05-27 19:21:47 +0000739 /* calculate simple bloom-style bitmask for a given unicode string */
740
Antoine Pitrouf068f942010-01-13 14:19:12 +0000741 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000742
743 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200744 switch (kind) {
745 case PyUnicode_1BYTE_KIND:
746 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
747 break;
748 case PyUnicode_2BYTE_KIND:
749 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
750 break;
751 case PyUnicode_4BYTE_KIND:
752 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
753 break;
754 default:
755 assert(0);
756 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000757 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200758
759#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000760}
761
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200762/* Compilation of templated routines */
763
764#include "stringlib/asciilib.h"
765#include "stringlib/fastsearch.h"
766#include "stringlib/partition.h"
767#include "stringlib/split.h"
768#include "stringlib/count.h"
769#include "stringlib/find.h"
770#include "stringlib/find_max_char.h"
771#include "stringlib/localeutil.h"
772#include "stringlib/undef.h"
773
774#include "stringlib/ucs1lib.h"
775#include "stringlib/fastsearch.h"
776#include "stringlib/partition.h"
777#include "stringlib/split.h"
778#include "stringlib/count.h"
779#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300780#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs2lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs4lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200807#include "stringlib/unicodedefs.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/count.h"
810#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100811#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200812
Guido van Rossumd57fd912000-03-10 22:53:23 +0000813/* --- Unicode Object ----------------------------------------------------- */
814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200815static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200816fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200817
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200818Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200819 Py_ssize_t size, Py_UCS4 ch,
820 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200822 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
823
824 switch (kind) {
825 case PyUnicode_1BYTE_KIND:
826 {
827 Py_UCS1 ch1 = (Py_UCS1) ch;
828 if (ch1 == ch)
829 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
830 else
831 return -1;
832 }
833 case PyUnicode_2BYTE_KIND:
834 {
835 Py_UCS2 ch2 = (Py_UCS2) ch;
836 if (ch2 == ch)
837 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
838 else
839 return -1;
840 }
841 case PyUnicode_4BYTE_KIND:
842 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
843 default:
844 assert(0);
845 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200846 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200847}
848
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, current length)
   with 0xff bytes so that use of uninitialized data is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the number of bytes per character */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
867
Victor Stinnerfe226c02011-10-03 03:52:20 +0200868static PyObject*
869resize_compact(PyObject *unicode, Py_ssize_t length)
870{
871 Py_ssize_t char_size;
872 Py_ssize_t struct_size;
873 Py_ssize_t new_size;
874 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100875 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200876#ifdef Py_DEBUG
877 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
878#endif
879
Victor Stinner79891572012-05-03 13:43:07 +0200880 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200881 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100882 assert(PyUnicode_IS_COMPACT(unicode));
883
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200884 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100885 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200886 struct_size = sizeof(PyASCIIObject);
887 else
888 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200889 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200890
Victor Stinnerfe226c02011-10-03 03:52:20 +0200891 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
892 PyErr_NoMemory();
893 return NULL;
894 }
895 new_size = (struct_size + (length + 1) * char_size);
896
Victor Stinner84def372011-12-11 20:04:56 +0100897 _Py_DEC_REFTOTAL;
898 _Py_ForgetReference(unicode);
899
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300900 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100901 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100902 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200903 PyErr_NoMemory();
904 return NULL;
905 }
Victor Stinner84def372011-12-11 20:04:56 +0100906 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200907 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100908
Victor Stinnerfe226c02011-10-03 03:52:20 +0200909 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200910 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200911 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100912 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200913 _PyUnicode_WSTR_LENGTH(unicode) = length;
914 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100915 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
916 PyObject_DEL(_PyUnicode_WSTR(unicode));
917 _PyUnicode_WSTR(unicode) = NULL;
918 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200919#ifdef Py_DEBUG
920 unicode_fill_invalid(unicode, old_length);
921#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200922 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
923 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200924 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200925 return unicode;
926}
927
Alexander Belopolsky40018472011-02-26 01:02:56 +0000928static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200929resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000930{
Victor Stinner95663112011-10-04 01:03:50 +0200931 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100932 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200933 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200934 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000935
Victor Stinnerfe226c02011-10-03 03:52:20 +0200936 if (PyUnicode_IS_READY(unicode)) {
937 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200938 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200940#ifdef Py_DEBUG
941 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
942#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943
944 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200945 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200946 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
947 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200948
949 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
950 PyErr_NoMemory();
951 return -1;
952 }
953 new_size = (length + 1) * char_size;
954
Victor Stinner7a9105a2011-12-12 00:13:42 +0100955 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
956 {
957 PyObject_DEL(_PyUnicode_UTF8(unicode));
958 _PyUnicode_UTF8(unicode) = NULL;
959 _PyUnicode_UTF8_LENGTH(unicode) = 0;
960 }
961
Victor Stinnerfe226c02011-10-03 03:52:20 +0200962 data = (PyObject *)PyObject_REALLOC(data, new_size);
963 if (data == NULL) {
964 PyErr_NoMemory();
965 return -1;
966 }
967 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200968 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200969 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200970 _PyUnicode_WSTR_LENGTH(unicode) = length;
971 }
972 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200973 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200974 _PyUnicode_UTF8_LENGTH(unicode) = length;
975 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200976 _PyUnicode_LENGTH(unicode) = length;
977 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200978#ifdef Py_DEBUG
979 unicode_fill_invalid(unicode, old_length);
980#endif
Victor Stinner95663112011-10-04 01:03:50 +0200981 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200982 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200983 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200984 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200985 }
Victor Stinner95663112011-10-04 01:03:50 +0200986 assert(_PyUnicode_WSTR(unicode) != NULL);
987
988 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700989 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200990 PyErr_NoMemory();
991 return -1;
992 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100993 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200994 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100995 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200996 if (!wstr) {
997 PyErr_NoMemory();
998 return -1;
999 }
1000 _PyUnicode_WSTR(unicode) = wstr;
1001 _PyUnicode_WSTR(unicode)[length] = 0;
1002 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001003 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001004 return 0;
1005}
1006
Victor Stinnerfe226c02011-10-03 03:52:20 +02001007static PyObject*
1008resize_copy(PyObject *unicode, Py_ssize_t length)
1009{
1010 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001011 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001012 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001013
Benjamin Petersonbac79492012-01-14 13:34:47 -05001014 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001015 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001016
1017 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1018 if (copy == NULL)
1019 return NULL;
1020
1021 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001022 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001023 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001024 }
1025 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001026 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001027
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001028 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 if (w == NULL)
1030 return NULL;
1031 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1032 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001033 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1034 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001035 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001036 }
1037}
1038
/* We allocate one more character (not just one byte) to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001044 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045
1046*/
1047
Alexander Belopolsky40018472011-02-26 01:02:56 +00001048static PyUnicodeObject *
1049_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001050{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001051 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053
Thomas Wouters477c8d52006-05-27 19:21:47 +00001054 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 if (length == 0 && unicode_empty != NULL) {
1056 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001057 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058 }
1059
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001060 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001061 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001062 return (PyUnicodeObject *)PyErr_NoMemory();
1063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 if (length < 0) {
1065 PyErr_SetString(PyExc_SystemError,
1066 "Negative size passed to _PyUnicode_New");
1067 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 }
1069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1071 if (unicode == NULL)
1072 return NULL;
1073 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001074
1075 _PyUnicode_WSTR_LENGTH(unicode) = length;
1076 _PyUnicode_HASH(unicode) = -1;
1077 _PyUnicode_STATE(unicode).interned = 0;
1078 _PyUnicode_STATE(unicode).kind = 0;
1079 _PyUnicode_STATE(unicode).compact = 0;
1080 _PyUnicode_STATE(unicode).ready = 0;
1081 _PyUnicode_STATE(unicode).ascii = 0;
1082 _PyUnicode_DATA_ANY(unicode) = NULL;
1083 _PyUnicode_LENGTH(unicode) = 0;
1084 _PyUnicode_UTF8(unicode) = NULL;
1085 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001087 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1088 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001089 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001090 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001091 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093
Jeremy Hyltond8082792003-09-16 19:41:39 +00001094 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001095 * the caller fails before initializing str -- unicode_resize()
1096 * reads str[0], and the Keep-Alive optimization can keep memory
1097 * allocated for str alive across a call to unicode_dealloc(unicode).
1098 * We don't want unicode_resize to read uninitialized memory in
1099 * that case.
1100 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101 _PyUnicode_WSTR(unicode)[0] = 0;
1102 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001103
Victor Stinner7931d9a2011-11-04 00:22:48 +01001104 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105 return unicode;
1106}
1107
Victor Stinnerf42dc442011-10-02 23:33:16 +02001108static const char*
1109unicode_kind_name(PyObject *unicode)
1110{
Victor Stinner42dfd712011-10-03 14:41:45 +02001111 /* don't check consistency: unicode_kind_name() is called from
1112 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001113 if (!PyUnicode_IS_COMPACT(unicode))
1114 {
1115 if (!PyUnicode_IS_READY(unicode))
1116 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001117 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001118 {
1119 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001120 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001121 return "legacy ascii";
1122 else
1123 return "legacy latin1";
1124 case PyUnicode_2BYTE_KIND:
1125 return "legacy UCS2";
1126 case PyUnicode_4BYTE_KIND:
1127 return "legacy UCS4";
1128 default:
1129 return "<legacy invalid kind>";
1130 }
1131 }
1132 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001133 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001134 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001135 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001136 return "ascii";
1137 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001138 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001139 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001140 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001141 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001142 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001143 default:
1144 return "<invalid compact kind>";
1145 }
1146}
1147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
1153
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
1157void *_PyUnicode_data(void *unicode){
1158 printf("obj %p\n", unicode);
1159 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1160 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1161 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1162 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1163 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1164 return PyUnicode_DATA(unicode);
1165}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001166
1167void
1168_PyUnicode_Dump(PyObject *op)
1169{
1170 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001171 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1172 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1173 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001174
Victor Stinnera849a4b2011-10-03 12:12:11 +02001175 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001176 {
1177 if (ascii->state.ascii)
1178 data = (ascii + 1);
1179 else
1180 data = (compact + 1);
1181 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001182 else
1183 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001184 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1185 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001186
Victor Stinnera849a4b2011-10-03 12:12:11 +02001187 if (ascii->wstr == data)
1188 printf("shared ");
1189 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001190
Victor Stinnera3b334d2011-10-03 13:53:37 +02001191 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001192 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001193 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1194 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001195 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1196 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001197 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001198 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001199}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200#endif
1201
1202PyObject *
1203PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1204{
1205 PyObject *obj;
1206 PyCompactUnicodeObject *unicode;
1207 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001208 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001209 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001210 Py_ssize_t char_size;
1211 Py_ssize_t struct_size;
1212
1213 /* Optimization for empty strings */
1214 if (size == 0 && unicode_empty != NULL) {
1215 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001216 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 }
1218
Victor Stinner9e9d6892011-10-04 01:02:02 +02001219 is_ascii = 0;
1220 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221 struct_size = sizeof(PyCompactUnicodeObject);
1222 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001223 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 char_size = 1;
1225 is_ascii = 1;
1226 struct_size = sizeof(PyASCIIObject);
1227 }
1228 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001229 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001230 char_size = 1;
1231 }
1232 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001233 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234 char_size = 2;
1235 if (sizeof(wchar_t) == 2)
1236 is_sharing = 1;
1237 }
1238 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001239 if (maxchar > MAX_UNICODE) {
1240 PyErr_SetString(PyExc_SystemError,
1241 "invalid maximum character passed to PyUnicode_New");
1242 return NULL;
1243 }
Victor Stinner8f825062012-04-27 13:55:39 +02001244 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001245 char_size = 4;
1246 if (sizeof(wchar_t) == 4)
1247 is_sharing = 1;
1248 }
1249
1250 /* Ensure we won't overflow the size. */
1251 if (size < 0) {
1252 PyErr_SetString(PyExc_SystemError,
1253 "Negative size passed to PyUnicode_New");
1254 return NULL;
1255 }
1256 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1257 return PyErr_NoMemory();
1258
1259 /* Duplicated allocation code from _PyObject_New() instead of a call to
1260 * PyObject_New() so we are able to allocate space for the object and
1261 * it's data buffer.
1262 */
1263 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1264 if (obj == NULL)
1265 return PyErr_NoMemory();
1266 obj = PyObject_INIT(obj, &PyUnicode_Type);
1267 if (obj == NULL)
1268 return NULL;
1269
1270 unicode = (PyCompactUnicodeObject *)obj;
1271 if (is_ascii)
1272 data = ((PyASCIIObject*)obj) + 1;
1273 else
1274 data = unicode + 1;
1275 _PyUnicode_LENGTH(unicode) = size;
1276 _PyUnicode_HASH(unicode) = -1;
1277 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001278 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 _PyUnicode_STATE(unicode).compact = 1;
1280 _PyUnicode_STATE(unicode).ready = 1;
1281 _PyUnicode_STATE(unicode).ascii = is_ascii;
1282 if (is_ascii) {
1283 ((char*)data)[size] = 0;
1284 _PyUnicode_WSTR(unicode) = NULL;
1285 }
Victor Stinner8f825062012-04-27 13:55:39 +02001286 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001287 ((char*)data)[size] = 0;
1288 _PyUnicode_WSTR(unicode) = NULL;
1289 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001290 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001291 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001292 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001293 else {
1294 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001295 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001296 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001298 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299 ((Py_UCS4*)data)[size] = 0;
1300 if (is_sharing) {
1301 _PyUnicode_WSTR_LENGTH(unicode) = size;
1302 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1303 }
1304 else {
1305 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1306 _PyUnicode_WSTR(unicode) = NULL;
1307 }
1308 }
Victor Stinner8f825062012-04-27 13:55:39 +02001309#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001310 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001311#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001312 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313 return obj;
1314}
1315
#if SIZEOF_WCHAR_T == 2
/* Helper for platforms with a 16-bit wchar_t: copy the wchar_t range
   [begin, end) into the 4-byte data of `unicode`, joining each high
   surrogate that is directly followed by a low surrogate into a single
   code point.  All other units, lone surrogates included, are copied
   unchanged.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* surrogate pair: two input units, one output character */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1355
Victor Stinnercd9950f2011-10-02 00:34:53 +02001356static int
Victor Stinner488fa492011-12-12 00:01:39 +01001357unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001358{
Victor Stinner488fa492011-12-12 00:01:39 +01001359 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001360 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001361 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001362 return -1;
1363 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001364 return 0;
1365}
1366
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001367static int
1368_copy_characters(PyObject *to, Py_ssize_t to_start,
1369 PyObject *from, Py_ssize_t from_start,
1370 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001372 unsigned int from_kind, to_kind;
1373 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374
Victor Stinneree4544c2012-05-09 22:24:08 +02001375 assert(0 <= how_many);
1376 assert(0 <= from_start);
1377 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001378 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001379 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001380 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381
Victor Stinnerd3f08822012-05-29 12:57:52 +02001382 assert(PyUnicode_Check(to));
1383 assert(PyUnicode_IS_READY(to));
1384 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1385
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001386 if (how_many == 0)
1387 return 0;
1388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001390 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001392 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393
Victor Stinnerf1852262012-06-16 16:38:26 +02001394#ifdef Py_DEBUG
1395 if (!check_maxchar
1396 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1397 {
1398 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1399 Py_UCS4 ch;
1400 Py_ssize_t i;
1401 for (i=0; i < how_many; i++) {
1402 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1403 assert(ch <= to_maxchar);
1404 }
1405 }
1406#endif
1407
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001408 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001409 if (check_maxchar
1410 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1411 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001412 /* Writing Latin-1 characters into an ASCII string requires to
1413 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001414 Py_UCS4 max_char;
1415 max_char = ucs1lib_find_max_char(from_data,
1416 (Py_UCS1*)from_data + how_many);
1417 if (max_char >= 128)
1418 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001419 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001420 Py_MEMCPY((char*)to_data + to_kind * to_start,
1421 (char*)from_data + from_kind * from_start,
1422 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001423 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001424 else if (from_kind == PyUnicode_1BYTE_KIND
1425 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001426 {
1427 _PyUnicode_CONVERT_BYTES(
1428 Py_UCS1, Py_UCS2,
1429 PyUnicode_1BYTE_DATA(from) + from_start,
1430 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1431 PyUnicode_2BYTE_DATA(to) + to_start
1432 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001433 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001434 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001435 && to_kind == PyUnicode_4BYTE_KIND)
1436 {
1437 _PyUnicode_CONVERT_BYTES(
1438 Py_UCS1, Py_UCS4,
1439 PyUnicode_1BYTE_DATA(from) + from_start,
1440 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1441 PyUnicode_4BYTE_DATA(to) + to_start
1442 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001443 }
1444 else if (from_kind == PyUnicode_2BYTE_KIND
1445 && to_kind == PyUnicode_4BYTE_KIND)
1446 {
1447 _PyUnicode_CONVERT_BYTES(
1448 Py_UCS2, Py_UCS4,
1449 PyUnicode_2BYTE_DATA(from) + from_start,
1450 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1451 PyUnicode_4BYTE_DATA(to) + to_start
1452 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001453 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001454 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001455 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1456
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001457 if (!check_maxchar) {
1458 if (from_kind == PyUnicode_2BYTE_KIND
1459 && to_kind == PyUnicode_1BYTE_KIND)
1460 {
1461 _PyUnicode_CONVERT_BYTES(
1462 Py_UCS2, Py_UCS1,
1463 PyUnicode_2BYTE_DATA(from) + from_start,
1464 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1465 PyUnicode_1BYTE_DATA(to) + to_start
1466 );
1467 }
1468 else if (from_kind == PyUnicode_4BYTE_KIND
1469 && to_kind == PyUnicode_1BYTE_KIND)
1470 {
1471 _PyUnicode_CONVERT_BYTES(
1472 Py_UCS4, Py_UCS1,
1473 PyUnicode_4BYTE_DATA(from) + from_start,
1474 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1475 PyUnicode_1BYTE_DATA(to) + to_start
1476 );
1477 }
1478 else if (from_kind == PyUnicode_4BYTE_KIND
1479 && to_kind == PyUnicode_2BYTE_KIND)
1480 {
1481 _PyUnicode_CONVERT_BYTES(
1482 Py_UCS4, Py_UCS2,
1483 PyUnicode_4BYTE_DATA(from) + from_start,
1484 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1485 PyUnicode_2BYTE_DATA(to) + to_start
1486 );
1487 }
1488 else {
1489 assert(0);
1490 return -1;
1491 }
1492 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001493 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001494 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001495 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001496 Py_ssize_t i;
1497
Victor Stinnera0702ab2011-09-29 14:14:38 +02001498 for (i=0; i < how_many; i++) {
1499 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001500 if (ch > to_maxchar)
1501 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001502 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1503 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001504 }
1505 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001506 return 0;
1507}
1508
Victor Stinnerd3f08822012-05-29 12:57:52 +02001509void
1510_PyUnicode_FastCopyCharacters(
1511 PyObject *to, Py_ssize_t to_start,
1512 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001513{
1514 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1515}
1516
1517Py_ssize_t
1518PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1519 PyObject *from, Py_ssize_t from_start,
1520 Py_ssize_t how_many)
1521{
1522 int err;
1523
1524 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1525 PyErr_BadInternalCall();
1526 return -1;
1527 }
1528
Benjamin Petersonbac79492012-01-14 13:34:47 -05001529 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001530 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001531 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001532 return -1;
1533
Victor Stinnerd3f08822012-05-29 12:57:52 +02001534 if (from_start < 0) {
1535 PyErr_SetString(PyExc_IndexError, "string index out of range");
1536 return -1;
1537 }
1538 if (to_start < 0) {
1539 PyErr_SetString(PyExc_IndexError, "string index out of range");
1540 return -1;
1541 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001542 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1543 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1544 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001545 "Cannot write %zi characters at %zi "
1546 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547 how_many, to_start, PyUnicode_GET_LENGTH(to));
1548 return -1;
1549 }
1550
1551 if (how_many == 0)
1552 return 0;
1553
Victor Stinner488fa492011-12-12 00:01:39 +01001554 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001555 return -1;
1556
1557 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1558 if (err) {
1559 PyErr_Format(PyExc_SystemError,
1560 "Cannot copy %s characters "
1561 "into a string of %s characters",
1562 unicode_kind_name(from),
1563 unicode_kind_name(to));
1564 return -1;
1565 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001566 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567}
1568
Victor Stinner17222162011-09-28 22:15:37 +02001569/* Find the maximum code point and count the number of surrogate pairs so a
1570 correct string length can be computed before converting a string to UCS4.
1571 This function counts single surrogates as a character and not as a pair.
1572
1573 Return 0 on success, or -1 on error. */
1574static int
1575find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1576 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577{
1578 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001579 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580
Victor Stinnerc53be962011-10-02 21:33:54 +02001581 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001582 *num_surrogates = 0;
1583 *maxchar = 0;
1584
1585 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001586#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001587 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1588 && (iter+1) < end
1589 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1590 {
1591 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1592 ++(*num_surrogates);
1593 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594 }
1595 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001596#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001597 {
1598 ch = *iter;
1599 iter++;
1600 }
1601 if (ch > *maxchar) {
1602 *maxchar = ch;
1603 if (*maxchar > MAX_UNICODE) {
1604 PyErr_Format(PyExc_ValueError,
1605 "character U+%x is not in range [U+0000; U+10ffff]",
1606 ch);
1607 return -1;
1608 }
1609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001610 }
1611 return 0;
1612}
1613
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001614int
1615_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616{
1617 wchar_t *end;
1618 Py_UCS4 maxchar = 0;
1619 Py_ssize_t num_surrogates;
1620#if SIZEOF_WCHAR_T == 2
1621 Py_ssize_t length_wo_surrogates;
1622#endif
1623
Georg Brandl7597add2011-10-05 16:36:47 +02001624 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001625 strings were created using _PyObject_New() and where no canonical
1626 representation (the str field) has been set yet aka strings
1627 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001628 assert(_PyUnicode_CHECK(unicode));
1629 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001630 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001631 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001632 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001633 /* Actually, it should neither be interned nor be anything else: */
1634 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001637 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001638 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001639 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640
1641 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001642 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1643 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 PyErr_NoMemory();
1645 return -1;
1646 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001647 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 _PyUnicode_WSTR(unicode), end,
1649 PyUnicode_1BYTE_DATA(unicode));
1650 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1651 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1652 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1653 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001654 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001655 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001656 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657 }
1658 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001659 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001660 _PyUnicode_UTF8(unicode) = NULL;
1661 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662 }
1663 PyObject_FREE(_PyUnicode_WSTR(unicode));
1664 _PyUnicode_WSTR(unicode) = NULL;
1665 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1666 }
1667 /* In this case we might have to convert down from 4-byte native
1668 wchar_t to 2-byte unicode. */
1669 else if (maxchar < 65536) {
1670 assert(num_surrogates == 0 &&
1671 "FindMaxCharAndNumSurrogatePairs() messed up");
1672
Victor Stinner506f5922011-09-28 22:34:18 +02001673#if SIZEOF_WCHAR_T == 2
1674 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001675 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001676 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1677 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1678 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001679 _PyUnicode_UTF8(unicode) = NULL;
1680 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001681#else
1682 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001683 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001684 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001685 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001686 PyErr_NoMemory();
1687 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 }
Victor Stinner506f5922011-09-28 22:34:18 +02001689 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1690 _PyUnicode_WSTR(unicode), end,
1691 PyUnicode_2BYTE_DATA(unicode));
1692 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1693 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1694 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001695 _PyUnicode_UTF8(unicode) = NULL;
1696 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001697 PyObject_FREE(_PyUnicode_WSTR(unicode));
1698 _PyUnicode_WSTR(unicode) = NULL;
1699 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1700#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701 }
1702 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1703 else {
1704#if SIZEOF_WCHAR_T == 2
1705 /* in case the native representation is 2-bytes, we need to allocate a
1706 new normalized 4-byte version. */
1707 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001708 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1709 PyErr_NoMemory();
1710 return -1;
1711 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001712 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1713 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714 PyErr_NoMemory();
1715 return -1;
1716 }
1717 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1718 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001719 _PyUnicode_UTF8(unicode) = NULL;
1720 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001721 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1722 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001723 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 PyObject_FREE(_PyUnicode_WSTR(unicode));
1725 _PyUnicode_WSTR(unicode) = NULL;
1726 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1727#else
1728 assert(num_surrogates == 0);
1729
Victor Stinnerc3c74152011-10-02 20:39:55 +02001730 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001732 _PyUnicode_UTF8(unicode) = NULL;
1733 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1735#endif
1736 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1737 }
1738 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001739 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740 return 0;
1741}
1742
Alexander Belopolsky40018472011-02-26 01:02:56 +00001743static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001744unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745{
Walter Dörwald16807132007-05-25 13:52:07 +00001746 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001747 case SSTATE_NOT_INTERNED:
1748 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001749
Benjamin Peterson29060642009-01-31 22:14:21 +00001750 case SSTATE_INTERNED_MORTAL:
1751 /* revive dead object temporarily for DelItem */
1752 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001753 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001754 Py_FatalError(
1755 "deletion of interned string failed");
1756 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001757
Benjamin Peterson29060642009-01-31 22:14:21 +00001758 case SSTATE_INTERNED_IMMORTAL:
1759 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001760
Benjamin Peterson29060642009-01-31 22:14:21 +00001761 default:
1762 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001763 }
1764
Victor Stinner03490912011-10-03 23:45:12 +02001765 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001767 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001768 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001769 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1770 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001772 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773}
1774
#ifdef Py_DEBUG
/* Debug helper: return 1 if 'unicode' is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of length 1 can be a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1791
Alexander Belopolsky40018472011-02-26 01:02:56 +00001792static int
Victor Stinner488fa492011-12-12 00:01:39 +01001793unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001794{
Victor Stinner488fa492011-12-12 00:01:39 +01001795 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001796 if (Py_REFCNT(unicode) != 1)
1797 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001798 if (_PyUnicode_HASH(unicode) != -1)
1799 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001800 if (PyUnicode_CHECK_INTERNED(unicode))
1801 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001802 if (!PyUnicode_CheckExact(unicode))
1803 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001804#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001805 /* singleton refcount is greater than 1 */
1806 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001807#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001808 return 1;
1809}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001810
Victor Stinnerfe226c02011-10-03 03:52:20 +02001811static int
1812unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1813{
1814 PyObject *unicode;
1815 Py_ssize_t old_length;
1816
1817 assert(p_unicode != NULL);
1818 unicode = *p_unicode;
1819
1820 assert(unicode != NULL);
1821 assert(PyUnicode_Check(unicode));
1822 assert(0 <= length);
1823
Victor Stinner910337b2011-10-03 03:20:16 +02001824 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001825 old_length = PyUnicode_WSTR_LENGTH(unicode);
1826 else
1827 old_length = PyUnicode_GET_LENGTH(unicode);
1828 if (old_length == length)
1829 return 0;
1830
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001831 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001832 _Py_INCREF_UNICODE_EMPTY();
1833 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001834 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001835 Py_DECREF(*p_unicode);
1836 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001837 return 0;
1838 }
1839
Victor Stinner488fa492011-12-12 00:01:39 +01001840 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001841 PyObject *copy = resize_copy(unicode, length);
1842 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001843 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001844 Py_DECREF(*p_unicode);
1845 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001846 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001847 }
1848
Victor Stinnerfe226c02011-10-03 03:52:20 +02001849 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001850 PyObject *new_unicode = resize_compact(unicode, length);
1851 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001852 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001853 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001854 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001855 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001856 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001857}
1858
Alexander Belopolsky40018472011-02-26 01:02:56 +00001859int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001860PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001861{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 PyObject *unicode;
1863 if (p_unicode == NULL) {
1864 PyErr_BadInternalCall();
1865 return -1;
1866 }
1867 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001868 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001869 {
1870 PyErr_BadInternalCall();
1871 return -1;
1872 }
1873 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001874}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001875
Victor Stinnerc5166102012-02-22 13:55:02 +01001876/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001877
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001878 WARNING: The function doesn't copy the terminating null character and
1879 doesn't check the maximum character (may write a latin1 character in an
1880 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001881static void
1882unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1883 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001884{
1885 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1886 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001887 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001888
1889 switch (kind) {
1890 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001891 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001892#ifdef Py_DEBUG
1893 if (PyUnicode_IS_ASCII(unicode)) {
1894 Py_UCS4 maxchar = ucs1lib_find_max_char(
1895 (const Py_UCS1*)str,
1896 (const Py_UCS1*)str + len);
1897 assert(maxchar < 128);
1898 }
1899#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001900 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001901 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001902 }
1903 case PyUnicode_2BYTE_KIND: {
1904 Py_UCS2 *start = (Py_UCS2 *)data + index;
1905 Py_UCS2 *ucs2 = start;
1906 assert(index <= PyUnicode_GET_LENGTH(unicode));
1907
Victor Stinner184252a2012-06-16 02:57:41 +02001908 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001909 *ucs2 = (Py_UCS2)*str;
1910
1911 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001912 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001913 }
1914 default: {
1915 Py_UCS4 *start = (Py_UCS4 *)data + index;
1916 Py_UCS4 *ucs4 = start;
1917 assert(kind == PyUnicode_4BYTE_KIND);
1918 assert(index <= PyUnicode_GET_LENGTH(unicode));
1919
Victor Stinner184252a2012-06-16 02:57:41 +02001920 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001921 *ucs4 = (Py_UCS4)*str;
1922
1923 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001924 }
1925 }
1926}
1927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001928static PyObject*
1929get_latin1_char(unsigned char ch)
1930{
Victor Stinnera464fc12011-10-02 20:39:30 +02001931 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001933 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001934 if (!unicode)
1935 return NULL;
1936 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001937 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001938 unicode_latin1[ch] = unicode;
1939 }
1940 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001941 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001942}
1943
Victor Stinner985a82a2014-01-03 12:53:47 +01001944static PyObject*
1945unicode_char(Py_UCS4 ch)
1946{
1947 PyObject *unicode;
1948
1949 assert(ch <= MAX_UNICODE);
1950
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001951 if (ch < 256)
1952 return get_latin1_char(ch);
1953
Victor Stinner985a82a2014-01-03 12:53:47 +01001954 unicode = PyUnicode_New(1, ch);
1955 if (unicode == NULL)
1956 return NULL;
1957 switch (PyUnicode_KIND(unicode)) {
1958 case PyUnicode_1BYTE_KIND:
1959 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1960 break;
1961 case PyUnicode_2BYTE_KIND:
1962 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1963 break;
1964 default:
1965 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1966 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1967 }
1968 assert(_PyUnicode_CheckConsistency(unicode, 1));
1969 return unicode;
1970}
1971
Alexander Belopolsky40018472011-02-26 01:02:56 +00001972PyObject *
1973PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001975 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001976 Py_UCS4 maxchar = 0;
1977 Py_ssize_t num_surrogates;
1978
1979 if (u == NULL)
1980 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001982 /* If the Unicode data is known at construction time, we can apply
1983 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001986 if (size == 0)
1987 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989 /* Single character Unicode objects in the Latin-1 range are
1990 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001991 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 return get_latin1_char((unsigned char)*u);
1993
1994 /* If not empty and not single character, copy the Unicode data
1995 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001996 if (find_maxchar_surrogates(u, u + size,
1997 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 return NULL;
1999
Victor Stinner8faf8212011-12-08 22:14:11 +01002000 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001 if (!unicode)
2002 return NULL;
2003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 switch (PyUnicode_KIND(unicode)) {
2005 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002006 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2008 break;
2009 case PyUnicode_2BYTE_KIND:
2010#if Py_UNICODE_SIZE == 2
2011 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2012#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002013 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2015#endif
2016 break;
2017 case PyUnicode_4BYTE_KIND:
2018#if SIZEOF_WCHAR_T == 2
2019 /* This is the only case which has to process surrogates, thus
2020 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002021 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022#else
2023 assert(num_surrogates == 0);
2024 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2025#endif
2026 break;
2027 default:
2028 assert(0 && "Impossible state");
2029 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002031 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032}
2033
Alexander Belopolsky40018472011-02-26 01:02:56 +00002034PyObject *
2035PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002036{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002037 if (size < 0) {
2038 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002039 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002040 return NULL;
2041 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002042 if (u != NULL)
2043 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2044 else
2045 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002046}
2047
Alexander Belopolsky40018472011-02-26 01:02:56 +00002048PyObject *
2049PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002050{
2051 size_t size = strlen(u);
2052 if (size > PY_SSIZE_T_MAX) {
2053 PyErr_SetString(PyExc_OverflowError, "input too long");
2054 return NULL;
2055 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002056 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002057}
2058
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002059PyObject *
2060_PyUnicode_FromId(_Py_Identifier *id)
2061{
2062 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002063 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2064 strlen(id->string),
2065 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002066 if (!id->object)
2067 return NULL;
2068 PyUnicode_InternInPlace(&id->object);
2069 assert(!id->next);
2070 id->next = static_strings;
2071 static_strings = id;
2072 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002073 return id->object;
2074}
2075
2076void
2077_PyUnicode_ClearStaticStrings()
2078{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002079 _Py_Identifier *tmp, *s = static_strings;
2080 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002081 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002082 tmp = s->next;
2083 s->next = NULL;
2084 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002085 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002086 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002087}
2088
Benjamin Peterson0df54292012-03-26 14:50:32 -04002089/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002090
Victor Stinnerd3f08822012-05-29 12:57:52 +02002091PyObject*
2092_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002093{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002094 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002095 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002096 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002097#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002098 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002099#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002100 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002101 }
Victor Stinner785938e2011-12-11 20:09:03 +01002102 unicode = PyUnicode_New(size, 127);
2103 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002104 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002105 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2106 assert(_PyUnicode_CheckConsistency(unicode, 1));
2107 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002108}
2109
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002110static Py_UCS4
2111kind_maxchar_limit(unsigned int kind)
2112{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002113 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002114 case PyUnicode_1BYTE_KIND:
2115 return 0x80;
2116 case PyUnicode_2BYTE_KIND:
2117 return 0x100;
2118 case PyUnicode_4BYTE_KIND:
2119 return 0x10000;
2120 default:
2121 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002122 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002123 }
2124}
2125
Victor Stinnere6abb482012-05-02 01:15:40 +02002126Py_LOCAL_INLINE(Py_UCS4)
2127align_maxchar(Py_UCS4 maxchar)
2128{
2129 if (maxchar <= 127)
2130 return 127;
2131 else if (maxchar <= 255)
2132 return 255;
2133 else if (maxchar <= 65535)
2134 return 65535;
2135 else
2136 return MAX_UNICODE;
2137}
2138
Victor Stinner702c7342011-10-05 13:50:52 +02002139static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002140_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002141{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002142 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002143 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002144
Serhiy Storchaka678db842013-01-26 12:16:36 +02002145 if (size == 0)
2146 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002147 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002148 if (size == 1)
2149 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002150
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002151 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002152 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 if (!res)
2154 return NULL;
2155 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002156 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002158}
2159
Victor Stinnere57b1c02011-09-28 22:20:48 +02002160static PyObject*
2161_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162{
2163 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002164 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002165
Serhiy Storchaka678db842013-01-26 12:16:36 +02002166 if (size == 0)
2167 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002168 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002169 if (size == 1)
2170 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002171
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002172 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002173 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 if (!res)
2175 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002176 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002178 else {
2179 _PyUnicode_CONVERT_BYTES(
2180 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2181 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002182 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 return res;
2184}
2185
Victor Stinnere57b1c02011-09-28 22:20:48 +02002186static PyObject*
2187_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188{
2189 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002190 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002191
Serhiy Storchaka678db842013-01-26 12:16:36 +02002192 if (size == 0)
2193 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002194 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002195 if (size == 1)
2196 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002197
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002198 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002199 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 if (!res)
2201 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002202 if (max_char < 256)
2203 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2204 PyUnicode_1BYTE_DATA(res));
2205 else if (max_char < 0x10000)
2206 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2207 PyUnicode_2BYTE_DATA(res));
2208 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002210 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 return res;
2212}
2213
2214PyObject*
2215PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2216{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002217 if (size < 0) {
2218 PyErr_SetString(PyExc_ValueError, "size must be positive");
2219 return NULL;
2220 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002221 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002223 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002225 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002227 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002228 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002229 PyErr_SetString(PyExc_SystemError, "invalid kind");
2230 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232}
2233
Victor Stinnerece58de2012-04-23 23:36:38 +02002234Py_UCS4
2235_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2236{
2237 enum PyUnicode_Kind kind;
2238 void *startptr, *endptr;
2239
2240 assert(PyUnicode_IS_READY(unicode));
2241 assert(0 <= start);
2242 assert(end <= PyUnicode_GET_LENGTH(unicode));
2243 assert(start <= end);
2244
2245 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2246 return PyUnicode_MAX_CHAR_VALUE(unicode);
2247
2248 if (start == end)
2249 return 127;
2250
Victor Stinner94d558b2012-04-27 22:26:58 +02002251 if (PyUnicode_IS_ASCII(unicode))
2252 return 127;
2253
Victor Stinnerece58de2012-04-23 23:36:38 +02002254 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002255 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002256 endptr = (char *)startptr + end * kind;
2257 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002258 switch(kind) {
2259 case PyUnicode_1BYTE_KIND:
2260 return ucs1lib_find_max_char(startptr, endptr);
2261 case PyUnicode_2BYTE_KIND:
2262 return ucs2lib_find_max_char(startptr, endptr);
2263 case PyUnicode_4BYTE_KIND:
2264 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002265 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002266 assert(0);
2267 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002268 }
2269}
2270
Victor Stinner25a4b292011-10-06 12:31:55 +02002271/* Ensure that a string uses the most efficient storage, if it is not the
2272 case: create a new string with of the right kind. Write NULL into *p_unicode
2273 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002274static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002275unicode_adjust_maxchar(PyObject **p_unicode)
2276{
2277 PyObject *unicode, *copy;
2278 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002279 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002280 unsigned int kind;
2281
2282 assert(p_unicode != NULL);
2283 unicode = *p_unicode;
2284 assert(PyUnicode_IS_READY(unicode));
2285 if (PyUnicode_IS_ASCII(unicode))
2286 return;
2287
2288 len = PyUnicode_GET_LENGTH(unicode);
2289 kind = PyUnicode_KIND(unicode);
2290 if (kind == PyUnicode_1BYTE_KIND) {
2291 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002292 max_char = ucs1lib_find_max_char(u, u + len);
2293 if (max_char >= 128)
2294 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002295 }
2296 else if (kind == PyUnicode_2BYTE_KIND) {
2297 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002298 max_char = ucs2lib_find_max_char(u, u + len);
2299 if (max_char >= 256)
2300 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002301 }
2302 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002303 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002304 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002305 max_char = ucs4lib_find_max_char(u, u + len);
2306 if (max_char >= 0x10000)
2307 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002308 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002309 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002310 if (copy != NULL)
2311 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002312 Py_DECREF(unicode);
2313 *p_unicode = copy;
2314}
2315
Victor Stinner034f6cf2011-09-30 02:26:44 +02002316PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002317_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002318{
Victor Stinner87af4f22011-11-21 23:03:47 +01002319 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002320 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002321
Victor Stinner034f6cf2011-09-30 02:26:44 +02002322 if (!PyUnicode_Check(unicode)) {
2323 PyErr_BadInternalCall();
2324 return NULL;
2325 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002326 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002327 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002328
Victor Stinner87af4f22011-11-21 23:03:47 +01002329 length = PyUnicode_GET_LENGTH(unicode);
2330 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002331 if (!copy)
2332 return NULL;
2333 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2334
Victor Stinner87af4f22011-11-21 23:03:47 +01002335 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2336 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002337 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002338 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002339}
2340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002341
Victor Stinnerbc603d12011-10-02 01:00:40 +02002342/* Widen Unicode objects to larger buffers. Don't write terminating null
2343 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002344
2345void*
2346_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2347{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002348 Py_ssize_t len;
2349 void *result;
2350 unsigned int skind;
2351
Benjamin Petersonbac79492012-01-14 13:34:47 -05002352 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002353 return NULL;
2354
2355 len = PyUnicode_GET_LENGTH(s);
2356 skind = PyUnicode_KIND(s);
2357 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002358 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359 return NULL;
2360 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002361 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002362 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002363 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002364 if (!result)
2365 return PyErr_NoMemory();
2366 assert(skind == PyUnicode_1BYTE_KIND);
2367 _PyUnicode_CONVERT_BYTES(
2368 Py_UCS1, Py_UCS2,
2369 PyUnicode_1BYTE_DATA(s),
2370 PyUnicode_1BYTE_DATA(s) + len,
2371 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002372 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002373 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002374 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002375 if (!result)
2376 return PyErr_NoMemory();
2377 if (skind == PyUnicode_2BYTE_KIND) {
2378 _PyUnicode_CONVERT_BYTES(
2379 Py_UCS2, Py_UCS4,
2380 PyUnicode_2BYTE_DATA(s),
2381 PyUnicode_2BYTE_DATA(s) + len,
2382 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002384 else {
2385 assert(skind == PyUnicode_1BYTE_KIND);
2386 _PyUnicode_CONVERT_BYTES(
2387 Py_UCS1, Py_UCS4,
2388 PyUnicode_1BYTE_DATA(s),
2389 PyUnicode_1BYTE_DATA(s) + len,
2390 result);
2391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002393 default:
2394 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 }
Victor Stinner01698042011-10-04 00:04:26 +02002396 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002397 return NULL;
2398}
2399
2400static Py_UCS4*
2401as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2402 int copy_null)
2403{
2404 int kind;
2405 void *data;
2406 Py_ssize_t len, targetlen;
2407 if (PyUnicode_READY(string) == -1)
2408 return NULL;
2409 kind = PyUnicode_KIND(string);
2410 data = PyUnicode_DATA(string);
2411 len = PyUnicode_GET_LENGTH(string);
2412 targetlen = len;
2413 if (copy_null)
2414 targetlen++;
2415 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002416 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 if (!target) {
2418 PyErr_NoMemory();
2419 return NULL;
2420 }
2421 }
2422 else {
2423 if (targetsize < targetlen) {
2424 PyErr_Format(PyExc_SystemError,
2425 "string is longer than the buffer");
2426 if (copy_null && 0 < targetsize)
2427 target[0] = 0;
2428 return NULL;
2429 }
2430 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002431 if (kind == PyUnicode_1BYTE_KIND) {
2432 Py_UCS1 *start = (Py_UCS1 *) data;
2433 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002435 else if (kind == PyUnicode_2BYTE_KIND) {
2436 Py_UCS2 *start = (Py_UCS2 *) data;
2437 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2438 }
2439 else {
2440 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 if (copy_null)
2444 target[len] = 0;
2445 return target;
2446}
2447
2448Py_UCS4*
2449PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2450 int copy_null)
2451{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002452 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453 PyErr_BadInternalCall();
2454 return NULL;
2455 }
2456 return as_ucs4(string, target, targetsize, copy_null);
2457}
2458
2459Py_UCS4*
2460PyUnicode_AsUCS4Copy(PyObject *string)
2461{
2462 return as_ucs4(string, NULL, 0, 1);
2463}
2464
2465#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002466
Alexander Belopolsky40018472011-02-26 01:02:56 +00002467PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002468PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002470 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002471 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002472 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002473 PyErr_BadInternalCall();
2474 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475 }
2476
Martin v. Löwis790465f2008-04-05 20:41:37 +00002477 if (size == -1) {
2478 size = wcslen(w);
2479 }
2480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002481 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482}
2483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002484#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002485
/* Buffer size used for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG * 53) - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002490
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002491static int
2492unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2493 Py_ssize_t width, Py_ssize_t precision)
2494{
2495 Py_ssize_t length, fill, arglen;
2496 Py_UCS4 maxchar;
2497
2498 if (PyUnicode_READY(str) == -1)
2499 return -1;
2500
2501 length = PyUnicode_GET_LENGTH(str);
2502 if ((precision == -1 || precision >= length)
2503 && width <= length)
2504 return _PyUnicodeWriter_WriteStr(writer, str);
2505
2506 if (precision != -1)
2507 length = Py_MIN(precision, length);
2508
2509 arglen = Py_MAX(length, width);
2510 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2511 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2512 else
2513 maxchar = writer->maxchar;
2514
2515 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2516 return -1;
2517
2518 if (width > length) {
2519 fill = width - length;
2520 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2521 return -1;
2522 writer->pos += fill;
2523 }
2524
2525 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2526 str, 0, length);
2527 writer->pos += length;
2528 return 0;
2529}
2530
2531static int
2532unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2533 Py_ssize_t width, Py_ssize_t precision)
2534{
2535 /* UTF-8 */
2536 Py_ssize_t length;
2537 PyObject *unicode;
2538 int res;
2539
2540 length = strlen(str);
2541 if (precision != -1)
2542 length = Py_MIN(length, precision);
2543 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2544 if (unicode == NULL)
2545 return -1;
2546
2547 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2548 Py_DECREF(unicode);
2549 return res;
2550}
2551
Victor Stinner96865452011-03-01 23:44:09 +00002552static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002553unicode_fromformat_arg(_PyUnicodeWriter *writer,
2554 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002555{
Victor Stinnere215d962012-10-06 23:03:36 +02002556 const char *p;
2557 Py_ssize_t len;
2558 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002559 Py_ssize_t width;
2560 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002561 int longflag;
2562 int longlongflag;
2563 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002564 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002565
2566 p = f;
2567 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002568 zeropad = 0;
2569 if (*f == '0') {
2570 zeropad = 1;
2571 f++;
2572 }
Victor Stinner96865452011-03-01 23:44:09 +00002573
2574 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002575 width = -1;
2576 if (Py_ISDIGIT((unsigned)*f)) {
2577 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002578 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002579 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002580 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002581 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002582 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002583 return NULL;
2584 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002585 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002586 f++;
2587 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002588 }
2589 precision = -1;
2590 if (*f == '.') {
2591 f++;
2592 if (Py_ISDIGIT((unsigned)*f)) {
2593 precision = (*f - '0');
2594 f++;
2595 while (Py_ISDIGIT((unsigned)*f)) {
2596 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2597 PyErr_SetString(PyExc_ValueError,
2598 "precision too big");
2599 return NULL;
2600 }
2601 precision = (precision * 10) + (*f - '0');
2602 f++;
2603 }
2604 }
Victor Stinner96865452011-03-01 23:44:09 +00002605 if (*f == '%') {
2606 /* "%.3%s" => f points to "3" */
2607 f--;
2608 }
2609 }
2610 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002611 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002612 f--;
2613 }
Victor Stinner96865452011-03-01 23:44:09 +00002614
2615 /* Handle %ld, %lu, %lld and %llu. */
2616 longflag = 0;
2617 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002618 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002619 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002620 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002621 longflag = 1;
2622 ++f;
2623 }
2624#ifdef HAVE_LONG_LONG
2625 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002626 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002627 longlongflag = 1;
2628 f += 2;
2629 }
2630#endif
2631 }
2632 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002633 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002634 size_tflag = 1;
2635 ++f;
2636 }
Victor Stinnere215d962012-10-06 23:03:36 +02002637
2638 if (f[1] == '\0')
2639 writer->overallocate = 0;
2640
2641 switch (*f) {
2642 case 'c':
2643 {
2644 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002645 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002646 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002647 "character argument not in range(0x110000)");
2648 return NULL;
2649 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002650 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002651 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002652 break;
2653 }
2654
2655 case 'i':
2656 case 'd':
2657 case 'u':
2658 case 'x':
2659 {
2660 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002661 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002662 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002663
2664 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002665 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002666 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002667 va_arg(*vargs, unsigned long));
2668#ifdef HAVE_LONG_LONG
2669 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002670 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002671 va_arg(*vargs, unsigned PY_LONG_LONG));
2672#endif
2673 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002674 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002675 va_arg(*vargs, size_t));
2676 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002677 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002678 va_arg(*vargs, unsigned int));
2679 }
2680 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002681 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002682 }
2683 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002684 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002685 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002686 va_arg(*vargs, long));
2687#ifdef HAVE_LONG_LONG
2688 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002689 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002690 va_arg(*vargs, PY_LONG_LONG));
2691#endif
2692 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002694 va_arg(*vargs, Py_ssize_t));
2695 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002696 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002697 va_arg(*vargs, int));
2698 }
2699 assert(len >= 0);
2700
Victor Stinnere215d962012-10-06 23:03:36 +02002701 if (precision < len)
2702 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002703
2704 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002705 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2706 return NULL;
2707
Victor Stinnere215d962012-10-06 23:03:36 +02002708 if (width > precision) {
2709 Py_UCS4 fillchar;
2710 fill = width - precision;
2711 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002712 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2713 return NULL;
2714 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002715 }
Victor Stinner15a11362012-10-06 23:48:20 +02002716 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002717 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002718 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2719 return NULL;
2720 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002721 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002722
Victor Stinner4a587072013-11-19 12:54:53 +01002723 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2724 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002725 break;
2726 }
2727
2728 case 'p':
2729 {
2730 char number[MAX_LONG_LONG_CHARS];
2731
2732 len = sprintf(number, "%p", va_arg(*vargs, void*));
2733 assert(len >= 0);
2734
2735 /* %p is ill-defined: ensure leading 0x. */
2736 if (number[1] == 'X')
2737 number[1] = 'x';
2738 else if (number[1] != 'x') {
2739 memmove(number + 2, number,
2740 strlen(number) + 1);
2741 number[0] = '0';
2742 number[1] = 'x';
2743 len += 2;
2744 }
2745
Victor Stinner4a587072013-11-19 12:54:53 +01002746 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002747 return NULL;
2748 break;
2749 }
2750
2751 case 's':
2752 {
2753 /* UTF-8 */
2754 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002755 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002756 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002757 break;
2758 }
2759
2760 case 'U':
2761 {
2762 PyObject *obj = va_arg(*vargs, PyObject *);
2763 assert(obj && _PyUnicode_CHECK(obj));
2764
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002766 return NULL;
2767 break;
2768 }
2769
2770 case 'V':
2771 {
2772 PyObject *obj = va_arg(*vargs, PyObject *);
2773 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002774 if (obj) {
2775 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002776 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002777 return NULL;
2778 }
2779 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 assert(str != NULL);
2781 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002782 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002783 }
2784 break;
2785 }
2786
2787 case 'S':
2788 {
2789 PyObject *obj = va_arg(*vargs, PyObject *);
2790 PyObject *str;
2791 assert(obj);
2792 str = PyObject_Str(obj);
2793 if (!str)
2794 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002795 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002796 Py_DECREF(str);
2797 return NULL;
2798 }
2799 Py_DECREF(str);
2800 break;
2801 }
2802
2803 case 'R':
2804 {
2805 PyObject *obj = va_arg(*vargs, PyObject *);
2806 PyObject *repr;
2807 assert(obj);
2808 repr = PyObject_Repr(obj);
2809 if (!repr)
2810 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002811 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002812 Py_DECREF(repr);
2813 return NULL;
2814 }
2815 Py_DECREF(repr);
2816 break;
2817 }
2818
2819 case 'A':
2820 {
2821 PyObject *obj = va_arg(*vargs, PyObject *);
2822 PyObject *ascii;
2823 assert(obj);
2824 ascii = PyObject_ASCII(obj);
2825 if (!ascii)
2826 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002827 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002828 Py_DECREF(ascii);
2829 return NULL;
2830 }
2831 Py_DECREF(ascii);
2832 break;
2833 }
2834
2835 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002836 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002837 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002838 break;
2839
2840 default:
2841 /* if we stumble upon an unknown formatting code, copy the rest
2842 of the format string to the output string. (we cannot just
2843 skip the code, since there's no way to know what's in the
2844 argument list) */
2845 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002846 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002847 return NULL;
2848 f = p+len;
2849 return f;
2850 }
2851
2852 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002853 return f;
2854}
2855
/* Build a str object from an ASCII-encoded, printf-style format string and
   a va_list holding the matching arguments.

   Literal runs of the format are appended to a _PyUnicodeWriter as ASCII
   text; every '%' directive is handed to unicode_fromformat_arg(), which
   returns the position following the directive (or NULL on error).

   Returns the result of _PyUnicodeWriter_Finish() on success.  On failure
   the writer is deallocated and NULL is returned; a ValueError is set when
   the format string contains a byte above 127. */
PyObject *
PyUnicode_FromFormatV(const char *format, va_list vargs)
{
    va_list vargs2;
    const char *f;
    _PyUnicodeWriter writer;

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = strlen(format) + 100;
    writer.overallocate = 1;

    /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
       Copy it to be able to pass a reference to a subfunction. */
    Py_VA_COPY(vargs2, vargs);

    for (f = format; *f; ) {
        if (*f == '%') {
            f = unicode_fromformat_arg(&writer, f, &vargs2);
            if (f == NULL)
                goto fail;
        }
        else {
            const char *p;
            Py_ssize_t len;

            /* Scan the literal run up to the next '%' or the end of the
               format, rejecting non-ASCII bytes on the way. */
            p = f;
            do
            {
                if ((unsigned char)*p > 127) {
                    PyErr_Format(PyExc_ValueError,
                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
                        "string, got a non-ASCII byte: 0x%02x",
                        (unsigned char)*p);
                    /* Go through the common cleanup so that the writer
                       buffer is released. */
                    goto fail;
                }
                p++;
            }
            while (*p != '\0' && *p != '%');
            len = p - f;

            /* This is the last chunk: no further growth is expected. */
            if (*p == '\0')
                writer.overallocate = 0;

            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
                goto fail;

            f = p;
        }
    }
    /* Every copied va_list must be released before leaving the function. */
    va_end(vargs2);
    return _PyUnicodeWriter_Finish(&writer);

  fail:
    va_end(vargs2);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
2911
/* Variadic front end of PyUnicode_FromFormatV(): collect the arguments that
   follow format into a va_list, delegate, and hand back whatever
   PyUnicode_FromFormatV() returned. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list args;
    PyObject *result;

#ifdef HAVE_STDARG_PROTOTYPES
    va_start(args, format);
#else
    va_start(args);
#endif

    result = PyUnicode_FromFormatV(format, args);
    va_end(args);

    return result;
}
2927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002928#ifdef HAVE_WCHAR_H
2929
/* Shared worker of PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
   expose a Unicode object as a wide character string.

   - w == NULL: nothing is copied and size is ignored; the return value is
     the number of wide characters needed for the conversion, counting the
     null character.

   - w != NULL: at most size wide characters (null character included) are
     copied into w; the return value is the number of wide characters
     written, not counting the null character.

   Returns -1 if PyUnicode_AsUnicodeAndSize() fails. */
static Py_ssize_t
unicode_aswidechar(PyObject *unicode,
                   wchar_t *w,
                   Py_ssize_t size)
{
    Py_ssize_t length;
    Py_ssize_t ncopy;
    const wchar_t *src;

    src = PyUnicode_AsUnicodeAndSize(unicode, &length);
    if (src == NULL)
        return -1;

    /* Size query only. */
    if (w == NULL)
        return length + 1;

    if (size > length) {
        /* Everything fits: copy one extra item, the trailing null. */
        ncopy = length + 1;
    }
    else {
        /* Truncated copy: exactly size items, reported as such. */
        ncopy = size;
        length = size;
    }
    Py_MEMCPY(w, src, ncopy * sizeof(wchar_t));
    return length;
}
2962
/* Public entry point: copy at most size wide characters of unicode into w.
   A NULL unicode is reported through PyErr_BadInternalCall() and yields -1;
   otherwise the result of unicode_aswidechar() is returned unchanged. */
Py_ssize_t
PyUnicode_AsWideChar(PyObject *unicode,
                     wchar_t *w,
                     Py_ssize_t size)
{
    if (unicode != NULL)
        return unicode_aswidechar(unicode, w, size);

    PyErr_BadInternalCall();
    return -1;
}
2974
/* Return a freshly allocated (PyMem_NEW) wide character copy of unicode.

   If size is not NULL, *size receives the number of wide characters
   written, as reported by unicode_aswidechar() (null character excluded).
   Returns NULL with an exception set if unicode is NULL, the conversion
   fails, or the buffer cannot be allocated. */
wchar_t*
PyUnicode_AsWideCharString(PyObject *unicode,
                           Py_ssize_t *size)
{
    wchar_t *result;
    Py_ssize_t nchars;

    if (!unicode) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* First pass: how many wide characters, terminator included? */
    nchars = unicode_aswidechar(unicode, NULL, 0);
    if (nchars == -1)
        return NULL;

    result = PyMem_NEW(wchar_t, nchars);
    if (!result) {
        PyErr_NoMemory();
        return NULL;
    }

    /* Second pass: fill the buffer that was just allocated. */
    nchars = unicode_aswidechar(unicode, result, nchars);
    if (nchars == -1) {
        PyMem_FREE(result);
        return NULL;
    }

    if (size)
        *size = nchars;
    return result;
}
3004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003005#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006
/* Return the one-character string for the code point ordinal, built by
   unicode_char().  Ordinals outside 0..MAX_UNICODE set ValueError and
   return NULL. */
PyObject *
PyUnicode_FromOrdinal(int ordinal)
{
    if (0 <= ordinal && ordinal <= MAX_UNICODE)
        return unicode_char((Py_UCS4)ordinal);

    PyErr_SetString(PyExc_ValueError,
                    "chr() arg not in range(0x110000)");
    return NULL;
}
3018
/* Coerce obj to an exact str object.

   - exact str: made ready with PyUnicode_READY() and returned with its
     reference count incremented;
   - str subclass instance: copied with _PyUnicode_Copy();
   - anything else: TypeError, NULL. */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    /* XXX Perhaps we should make this API an alias of
       PyObject_Str() instead ?! */
    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (!PyUnicode_CheckExact(obj)) {
        /* For a Unicode subtype that's not a Unicode object,
           return a true Unicode object with the same data. */
        return _PyUnicode_Copy(obj);
    }

    if (PyUnicode_READY(obj) == -1)
        return NULL;
    Py_INCREF(obj);
    return obj;
}
3040
/* Decode an encoded object to str with PyUnicode_Decode().

   bytes objects are decoded directly; str objects are rejected with
   TypeError; any other object must provide a simple buffer
   (PyObject_GetBuffer with PyBUF_SIMPLE), otherwise TypeError is raised.
   Empty input yields the empty string without calling the decoder.
   A NULL obj is reported through PyErr_BadInternalCall(). */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    Py_buffer view;
    PyObject *result;

    if (!obj) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Fast path: bytes is by far the most frequent input. */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) == 0) {
            _Py_RETURN_UNICODE_EMPTY();
        }
        result = PyUnicode_Decode(PyBytes_AS_STRING(obj),
                                  PyBytes_GET_SIZE(obj),
                                  encoding, errors);
        return result;
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Everything else has to go through the PEP 3118 buffer interface. */
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "coercing to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        PyBuffer_Release(&view);
        _Py_RETURN_UNICODE_EMPTY();
    }

    result = PyUnicode_Decode((char*) view.buf, view.len, encoding, errors);
    PyBuffer_Release(&view);
    return result;
}
3087
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result, including its terminating null byte, is written to lower,
   which has room for lower_len bytes.

   Return 0 on error (lower_len is 0, or encoding is longer than
   lower_len-1), 1 on success.  On error the content of lower is
   unspecified. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminating null byte.
       Bail out before computing &lower[lower_len - 1]: lower_len is
       unsigned, so the subtraction would wrap around and the bounds check
       in the loop below would never trigger. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last usable slot; it is reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3127
/* Decode size bytes starting at s into a str object.

   encoding is first normalized with _Py_normalize_encoding() (a NULL
   encoding normalizes to "utf-8"); well-known encodings are dispatched
   straight to their C decoders.  Any other encoding goes through
   _PyCodec_DecodeText() on a memoryview wrapping the input.

   Returns the decoded string, or NULL with an exception set.  A TypeError
   is raised if the codec returns something that is not a str. */
PyObject *
PyUnicode_Decode(const char *s,
                 Py_ssize_t size,
                 const char *encoding,
                 const char *errors)
{
    PyObject *buffer = NULL, *unicode;
    Py_buffer info;
    char lower[11];         /* Enough for any encoding shortcut */

    /* Shortcuts for common default encodings */
    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
        if ((strcmp(lower, "utf-8") == 0) ||
            (strcmp(lower, "utf8") == 0))
            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
        else if ((strcmp(lower, "latin-1") == 0) ||
                 (strcmp(lower, "latin1") == 0) ||
                 (strcmp(lower, "iso-8859-1") == 0) ||
                 (strcmp(lower, "iso8859-1") == 0))
            return PyUnicode_DecodeLatin1(s, size, errors);
#ifdef HAVE_MBCS
        else if (strcmp(lower, "mbcs") == 0)
            return PyUnicode_DecodeMBCS(s, size, errors);
#endif
        else if (strcmp(lower, "ascii") == 0)
            return PyUnicode_DecodeASCII(s, size, errors);
        else if (strcmp(lower, "utf-16") == 0)
            return PyUnicode_DecodeUTF16(s, size, errors, 0);
        else if (strcmp(lower, "utf-32") == 0)
            return PyUnicode_DecodeUTF32(s, size, errors, 0);
    }

    /* Decode via the codec registry.  The input is wrapped in a read-only
       memoryview; buffer stays NULL until that view exists, so onError can
       always Py_XDECREF it. */
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
        goto onError;
    buffer = PyMemoryView_FromBuffer(&info);
    if (buffer == NULL)
        goto onError;
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
    if (unicode == NULL)
        goto onError;
    if (!PyUnicode_Check(unicode)) {
        /* Two conversions, two arguments: the encoding name and the type
           of the object the decoder actually returned. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(unicode)->tp_name);
        Py_DECREF(unicode);
        goto onError;
    }
    Py_DECREF(buffer);
    return unicode_result(unicode);

  onError:
    Py_XDECREF(buffer);
    return NULL;
}
3186
/* Run a str object through the decoder registered for encoding and return
   whatever object the codec produces.

   unicode must be a str instance, otherwise PyErr_BadArgument() is called.
   A NULL encoding selects PyUnicode_GetDefaultEncoding().  Returns NULL
   with an exception set on failure. */
PyObject *
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        return NULL;

    /* This function places no restriction on the type of the codec result,
       so v is not necessarily a str.  unicode_result() (defined outside
       this part of the file) is applied to str objects only, as the
       sibling PyUnicode_AsDecodedUnicode() does after its type check;
       any other object is handed back untouched. */
    if (!PyUnicode_Check(v))
        return v;
    return unicode_result(v);
}
3211
/* Run a str object through the decoder registered for encoding and require
   the result to be a str.

   unicode must be a str instance, otherwise PyErr_BadArgument() is called.
   A NULL encoding selects PyUnicode_GetDefaultEncoding().  Returns NULL
   with an exception set on failure; a TypeError naming the offending type
   is raised when the codec returns something that is not a str. */
PyObject *
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        goto onError;
    if (!PyUnicode_Check(v)) {
        /* Report the type of the object the decoder returned (v), not the
           type of the input, which is known to be a str at this point. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        goto onError;
    }
    return unicode_result(v);

  onError:
    return NULL;
}
3245
/* Encode a Py_UNICODE buffer of the given size: wrap it in a temporary str
   object (PyUnicode_FromUnicode) and return the result of
   PyUnicode_AsEncodedString() on that object.  Returns NULL if the
   temporary cannot be created or the encoding step fails. */
PyObject *
PyUnicode_Encode(const Py_UNICODE *s,
                 Py_ssize_t size,
                 const char *encoding,
                 const char *errors)
{
    PyObject *text;
    PyObject *encoded;

    text = PyUnicode_FromUnicode(s, size);
    if (!text)
        return NULL;

    encoded = PyUnicode_AsEncodedString(text, encoding, errors);
    /* The temporary is no longer needed, whatever the outcome. */
    Py_DECREF(text);
    return encoded;
}
3261
/* Run a str object through the encoder registered for encoding and return
   the codec's result as is (NULL on failure).

   unicode must be a str instance, otherwise PyErr_BadArgument() is called
   and NULL is returned.  A NULL encoding selects
   PyUnicode_GetDefaultEncoding(). */
PyObject *
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    const char *codec_name = encoding;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!codec_name)
        codec_name = PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry; a NULL result propagates unchanged. */
    return PyCodec_Encode(unicode, codec_name, errors);
}
3286
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert, by converting the string one character at a time (one surrogate
   pair at a time when wchar_t is 2 bytes wide).

   Returns the index of the offending character in wstr, or 0 when every
   character converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminator */
#else
    wchar_t unit[2];            /* single character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cur = wstr;

    /* The last slot always holds the terminator. */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cur != L'\0') {
        /* Remember where this character starts before advancing. */
        const wchar_t *mark = cur;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cur[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cur[1]))
        {
            unit[0] = cur[0];
            unit[1] = cur[1];
            cur += 2;
        }
        else {
            unit[0] = *cur;
            unit[1] = 0;
            cur++;
        }
#else
        unit[0] = *cur;
        cur++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return mark - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3333
/* Translate an error handler name into the surrogateescape flag used by
   the locale codec helpers.

   The name is classified with get_error_handler().  On success 0 is
   returned and *surrogateescape is set to 0 (strict) or 1
   (surrogateescape).  Any other handler sets ValueError and returns -1,
   leaving *surrogateescape untouched. */
static int
locale_error_handler(const char *errors, int *surrogateescape)
{
    _Py_error_handler handler = get_error_handler(errors);

    if (handler == _Py_ERROR_STRICT) {
        *surrogateescape = 0;
        return 0;
    }
    if (handler == _Py_ERROR_SURROGATEESCAPE) {
        *surrogateescape = 1;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "only 'strict' and 'surrogateescape' error handlers "
                 "are supported, not '%s'",
                 errors);
    return -1;
}
3354
/* Encode a str object to bytes using the current locale encoding.

   errors is validated by locale_error_handler(), which accepts only the
   strict and surrogateescape handlers.  Strings whose wide character form
   contains an embedded null character are rejected with ValueError.

   In strict mode the conversion is done with wcstombs(); with
   surrogateescape it is delegated to Py_EncodeLocale().  When a character
   cannot be encoded, a UnicodeEncodeError describing its position is built
   and passed to PyCodec_StrictErrors().

   Returns a bytes object, or NULL on failure. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wide_len;
    wchar_t *wide;
    PyObject *result = NULL;
    PyObject *reason = NULL;
    PyObject *exc;
    char *errmsg;
    size_t error_pos;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wide = PyUnicode_AsWideCharString(unicode, &wide_len);
    if (wide == NULL)
        return NULL;

    /* A shorter wcslen() means there is a null character in the middle. */
    if ((Py_ssize_t)wcslen(wide) != wide_len) {
        PyMem_Free(wide);
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        return NULL;
    }

    if (!surrogateescape) {
        /* strict mode */
        size_t needed, written;

        /* Dry run to learn the size of the output. */
        needed = wcstombs(NULL, wide, 0);
        if (needed == (size_t)-1) {
            error_pos = (size_t)-1;
            goto encode_error;
        }

        result = PyBytes_FromStringAndSize(NULL, needed);
        if (result == NULL) {
            PyMem_Free(wide);
            return NULL;
        }

        written = wcstombs(PyBytes_AS_STRING(result), wide, needed + 1);
        if (written == (size_t)-1 || written > needed) {
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wide);
    }
    else {
        /* "surrogateescape" error handler */
        char *encoded = Py_EncodeLocale(wide, &error_pos);

        if (encoded == NULL) {
            /* Any position other than (size_t)-1 is an encoding error;
               (size_t)-1 is handled as an allocation failure. */
            if (error_pos != (size_t)-1)
                goto encode_error;
            PyErr_NoMemory();
            PyMem_Free(wide);
            return NULL;
        }
        PyMem_Free(wide);

        result = PyBytes_FromString(encoded);
        PyMem_Free(encoded);
    }
    return result;

encode_error:
    /* Capture the errno text first, before any other call is made. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    /* Position still unknown: search for it character by character. */
    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wide);

    PyMem_Free(wide);
    Py_XDECREF(result);

    if (errmsg != NULL) {
        size_t errlen;
        wchar_t *wmsg = Py_DecodeLocale(errmsg, &errlen);

        if (wmsg == NULL) {
            /* Fall back to the generic message below. */
            errmsg = NULL;
        }
        else {
            reason = PyUnicode_FromWideChar(wmsg, errlen);
            PyMem_RawFree(wmsg);
        }
    }
    if (errmsg == NULL) {
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    }
    if (reason == NULL)
        return NULL;

    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        PyCodec_StrictErrors(exc);
        Py_DECREF(exc);
    }
    return NULL;
}
3464
Victor Stinnerad158722010-10-27 00:25:46 +00003465PyObject *
3466PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003467{
Victor Stinner99b95382011-07-04 14:23:54 +02003468#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003469 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003470#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003471 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003472#else
Victor Stinner793b5312011-04-27 00:24:21 +02003473 PyInterpreterState *interp = PyThreadState_GET()->interp;
3474 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3475 cannot use it to encode and decode filenames before it is loaded. Load
3476 the Python codec requires to encode at least its own filename. Use the C
3477 version of the locale codec until the codec registry is initialized and
3478 the Python codec is loaded.
3479
3480 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3481 cannot only rely on it: check also interp->fscodec_initialized for
3482 subinterpreters. */
3483 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003484 return PyUnicode_AsEncodedString(unicode,
3485 Py_FileSystemDefaultEncoding,
3486 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003487 }
3488 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003489 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003490 }
Victor Stinnerad158722010-10-27 00:25:46 +00003491#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003492}
3493
Alexander Belopolsky40018472011-02-26 01:02:56 +00003494PyObject *
3495PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003496 const char *encoding,
3497 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498{
3499 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003500 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003501
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 if (!PyUnicode_Check(unicode)) {
3503 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505 }
Fred Drakee4315f52000-05-09 19:53:39 +00003506
Fred Drakee4315f52000-05-09 19:53:39 +00003507 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003508 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003509 if ((strcmp(lower, "utf-8") == 0) ||
3510 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003511 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003512 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003513 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003514 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003515 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003516 }
Victor Stinner37296e82010-06-10 13:36:23 +00003517 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003518 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003519 (strcmp(lower, "iso-8859-1") == 0) ||
3520 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003521 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003522#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003523 else if (strcmp(lower, "mbcs") == 0)
3524 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003525#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003526 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003527 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003528 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003529
3530 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003531 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003533 return NULL;
3534
3535 /* The normal path */
3536 if (PyBytes_Check(v))
3537 return v;
3538
3539 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003540 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003541 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003542 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003543
3544 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003545 "encoder %s returned bytearray instead of bytes; "
3546 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003547 encoding);
3548 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003549 Py_DECREF(v);
3550 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003551 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003552
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003553 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3554 Py_DECREF(v);
3555 return b;
3556 }
3557
3558 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003559 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3560 "use codecs.encode() to encode to arbitrary types",
3561 encoding,
3562 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003563 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003564 return NULL;
3565}
3566
Alexander Belopolsky40018472011-02-26 01:02:56 +00003567PyObject *
3568PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003569 const char *encoding,
3570 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003571{
3572 PyObject *v;
3573
3574 if (!PyUnicode_Check(unicode)) {
3575 PyErr_BadArgument();
3576 goto onError;
3577 }
3578
3579 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003580 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003581
3582 /* Encode via the codec registry */
3583 v = PyCodec_Encode(unicode, encoding, errors);
3584 if (v == NULL)
3585 goto onError;
3586 if (!PyUnicode_Check(v)) {
3587 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003588 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3589 "use codecs.encode() to encode to arbitrary types",
3590 encoding,
3591 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003592 Py_DECREF(v);
3593 goto onError;
3594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003596
Benjamin Peterson29060642009-01-31 22:14:21 +00003597 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003598 return NULL;
3599}
3600
/* Find the first byte sequence in str[0..len) that mbrtowc() cannot
   convert, starting from a zeroed conversion state.

   Returns the byte offset of that sequence.  Returns 0 when no failing
   sequence is found, and always returns 0 when HAVE_MBRTOWC is not
   defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3631
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003632PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003633PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003634 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003635{
3636 wchar_t smallbuf[256];
3637 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3638 wchar_t *wstr;
3639 size_t wlen, wlen2;
3640 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003641 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003642 size_t error_pos;
3643 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003644 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3645 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003646
3647 if (locale_error_handler(errors, &surrogateescape) < 0)
3648 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003649
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003650 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3651 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003652 return NULL;
3653 }
3654
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003655 if (surrogateescape) {
3656 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003657 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003658 if (wstr == NULL) {
3659 if (wlen == (size_t)-1)
3660 PyErr_NoMemory();
3661 else
3662 PyErr_SetFromErrno(PyExc_OSError);
3663 return NULL;
3664 }
3665
3666 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003667 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003668 }
3669 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003670 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003671#ifndef HAVE_BROKEN_MBSTOWCS
3672 wlen = mbstowcs(NULL, str, 0);
3673#else
3674 wlen = len;
3675#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003676 if (wlen == (size_t)-1)
3677 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003678 if (wlen+1 <= smallbuf_len) {
3679 wstr = smallbuf;
3680 }
3681 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003682 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003683 if (!wstr)
3684 return PyErr_NoMemory();
3685 }
3686
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003687 wlen2 = mbstowcs(wstr, str, wlen+1);
3688 if (wlen2 == (size_t)-1) {
3689 if (wstr != smallbuf)
3690 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003691 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003692 }
3693#ifdef HAVE_BROKEN_MBSTOWCS
3694 assert(wlen2 == wlen);
3695#endif
3696 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3697 if (wstr != smallbuf)
3698 PyMem_Free(wstr);
3699 }
3700 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003701
3702decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003703 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003704 errmsg = strerror(errno);
3705 assert(errmsg != NULL);
3706
3707 error_pos = mbstowcs_errorpos(str, len);
3708 if (errmsg != NULL) {
3709 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003710 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003711 if (wstr != NULL) {
3712 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003713 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003714 }
Victor Stinner2f197072011-12-17 07:08:30 +01003715 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003716 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003717 reason = PyUnicode_FromString(
3718 "mbstowcs() encountered an invalid multibyte sequence");
3719 if (reason == NULL)
3720 return NULL;
3721
3722 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3723 "locale", str, len,
3724 (Py_ssize_t)error_pos,
3725 (Py_ssize_t)(error_pos+1),
3726 reason);
3727 Py_DECREF(reason);
3728 if (exc != NULL) {
3729 PyCodec_StrictErrors(exc);
3730 Py_XDECREF(exc);
3731 }
3732 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003733}
3734
3735PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003736PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003737{
3738 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003739 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003740}
3741
3742
3743PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003744PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003745 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003746 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3747}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003748
Christian Heimes5894ba72007-11-04 11:43:14 +00003749PyObject*
3750PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3751{
Victor Stinner99b95382011-07-04 14:23:54 +02003752#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003753 return PyUnicode_DecodeMBCS(s, size, NULL);
3754#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003755 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003756#else
Victor Stinner793b5312011-04-27 00:24:21 +02003757 PyInterpreterState *interp = PyThreadState_GET()->interp;
3758 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3759 cannot use it to encode and decode filenames before it is loaded. Load
3760 the Python codec requires to encode at least its own filename. Use the C
3761 version of the locale codec until the codec registry is initialized and
3762 the Python codec is loaded.
3763
3764 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3765 cannot only rely on it: check also interp->fscodec_initialized for
3766 subinterpreters. */
3767 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003768 return PyUnicode_Decode(s, size,
3769 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003770 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003771 }
3772 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003773 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003774 }
Victor Stinnerad158722010-10-27 00:25:46 +00003775#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003776}
3777
Martin v. Löwis011e8422009-05-05 04:43:17 +00003778
3779int
3780PyUnicode_FSConverter(PyObject* arg, void* addr)
3781{
3782 PyObject *output = NULL;
3783 Py_ssize_t size;
3784 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003785 if (arg == NULL) {
3786 Py_DECREF(*(PyObject**)addr);
3787 return 1;
3788 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003789 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003790 output = arg;
3791 Py_INCREF(output);
3792 }
3793 else {
3794 arg = PyUnicode_FromObject(arg);
3795 if (!arg)
3796 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003797 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003798 Py_DECREF(arg);
3799 if (!output)
3800 return 0;
3801 if (!PyBytes_Check(output)) {
3802 Py_DECREF(output);
3803 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3804 return 0;
3805 }
3806 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003807 size = PyBytes_GET_SIZE(output);
3808 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003809 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003810 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003811 Py_DECREF(output);
3812 return 0;
3813 }
3814 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003815 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003816}
3817
3818
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003819int
3820PyUnicode_FSDecoder(PyObject* arg, void* addr)
3821{
3822 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003823 if (arg == NULL) {
3824 Py_DECREF(*(PyObject**)addr);
3825 return 1;
3826 }
3827 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003828 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003830 output = arg;
3831 Py_INCREF(output);
3832 }
3833 else {
3834 arg = PyBytes_FromObject(arg);
3835 if (!arg)
3836 return 0;
3837 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3838 PyBytes_GET_SIZE(arg));
3839 Py_DECREF(arg);
3840 if (!output)
3841 return 0;
3842 if (!PyUnicode_Check(output)) {
3843 Py_DECREF(output);
3844 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3845 return 0;
3846 }
3847 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003848 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003849 Py_DECREF(output);
3850 return 0;
3851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003852 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003853 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003854 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003855 Py_DECREF(output);
3856 return 0;
3857 }
3858 *(PyObject**)addr = output;
3859 return Py_CLEANUP_SUPPORTED;
3860}
3861
3862
Martin v. Löwis5b222132007-06-10 09:51:05 +00003863char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003865{
Christian Heimesf3863112007-11-22 07:46:41 +00003866 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003868 if (!PyUnicode_Check(unicode)) {
3869 PyErr_BadArgument();
3870 return NULL;
3871 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003872 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003873 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003875 if (PyUnicode_UTF8(unicode) == NULL) {
3876 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3878 if (bytes == NULL)
3879 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003880 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3881 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003882 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883 Py_DECREF(bytes);
3884 return NULL;
3885 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003886 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3887 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3888 PyBytes_AS_STRING(bytes),
3889 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 Py_DECREF(bytes);
3891 }
3892
3893 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003894 *psize = PyUnicode_UTF8_LENGTH(unicode);
3895 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003896}
3897
3898char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003900{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3902}
3903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003904Py_UNICODE *
3905PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 const unsigned char *one_byte;
3908#if SIZEOF_WCHAR_T == 4
3909 const Py_UCS2 *two_bytes;
3910#else
3911 const Py_UCS4 *four_bytes;
3912 const Py_UCS4 *ucs4_end;
3913 Py_ssize_t num_surrogates;
3914#endif
3915 wchar_t *w;
3916 wchar_t *wchar_end;
3917
3918 if (!PyUnicode_Check(unicode)) {
3919 PyErr_BadArgument();
3920 return NULL;
3921 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003922 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003924 assert(_PyUnicode_KIND(unicode) != 0);
3925 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003927 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003928#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003929 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3930 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931 num_surrogates = 0;
3932
3933 for (; four_bytes < ucs4_end; ++four_bytes) {
3934 if (*four_bytes > 0xFFFF)
3935 ++num_surrogates;
3936 }
3937
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003938 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3939 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3940 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941 PyErr_NoMemory();
3942 return NULL;
3943 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003944 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003946 w = _PyUnicode_WSTR(unicode);
3947 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3948 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3950 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003951 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003953 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3954 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955 }
3956 else
3957 *w = *four_bytes;
3958
3959 if (w > wchar_end) {
3960 assert(0 && "Miscalculated string end");
3961 }
3962 }
3963 *w = 0;
3964#else
3965 /* sizeof(wchar_t) == 4 */
3966 Py_FatalError("Impossible unicode object state, wstr and str "
3967 "should share memory already.");
3968 return NULL;
3969#endif
3970 }
3971 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003972 if ((size_t)_PyUnicode_LENGTH(unicode) >
3973 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3974 PyErr_NoMemory();
3975 return NULL;
3976 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003977 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3978 (_PyUnicode_LENGTH(unicode) + 1));
3979 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003980 PyErr_NoMemory();
3981 return NULL;
3982 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003983 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3984 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3985 w = _PyUnicode_WSTR(unicode);
3986 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003988 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3989 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003990 for (; w < wchar_end; ++one_byte, ++w)
3991 *w = *one_byte;
3992 /* null-terminate the wstr */
3993 *w = 0;
3994 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003995 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003997 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998 for (; w < wchar_end; ++two_bytes, ++w)
3999 *w = *two_bytes;
4000 /* null-terminate the wstr */
4001 *w = 0;
4002#else
4003 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004004 PyObject_FREE(_PyUnicode_WSTR(unicode));
4005 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 Py_FatalError("Impossible unicode object state, wstr "
4007 "and str should share memory already.");
4008 return NULL;
4009#endif
4010 }
4011 else {
4012 assert(0 && "This should never happen.");
4013 }
4014 }
4015 }
4016 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004017 *size = PyUnicode_WSTR_LENGTH(unicode);
4018 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004019}
4020
Alexander Belopolsky40018472011-02-26 01:02:56 +00004021Py_UNICODE *
4022PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025}
4026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027
Alexander Belopolsky40018472011-02-26 01:02:56 +00004028Py_ssize_t
4029PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030{
4031 if (!PyUnicode_Check(unicode)) {
4032 PyErr_BadArgument();
4033 goto onError;
4034 }
4035 return PyUnicode_GET_SIZE(unicode);
4036
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 return -1;
4039}
4040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041Py_ssize_t
4042PyUnicode_GetLength(PyObject *unicode)
4043{
Victor Stinner07621332012-06-16 04:53:46 +02004044 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 PyErr_BadArgument();
4046 return -1;
4047 }
Victor Stinner07621332012-06-16 04:53:46 +02004048 if (PyUnicode_READY(unicode) == -1)
4049 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 return PyUnicode_GET_LENGTH(unicode);
4051}
4052
4053Py_UCS4
4054PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4055{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004056 void *data;
4057 int kind;
4058
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004059 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4060 PyErr_BadArgument();
4061 return (Py_UCS4)-1;
4062 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004063 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004064 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065 return (Py_UCS4)-1;
4066 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004067 data = PyUnicode_DATA(unicode);
4068 kind = PyUnicode_KIND(unicode);
4069 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004070}
4071
4072int
4073PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4074{
4075 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004076 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004077 return -1;
4078 }
Victor Stinner488fa492011-12-12 00:01:39 +01004079 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004080 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004081 PyErr_SetString(PyExc_IndexError, "string index out of range");
4082 return -1;
4083 }
Victor Stinner488fa492011-12-12 00:01:39 +01004084 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004085 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004086 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4087 PyErr_SetString(PyExc_ValueError, "character out of range");
4088 return -1;
4089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004090 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4091 index, ch);
4092 return 0;
4093}
4094
/* Return the name of the default encoding, the constant string "utf-8".
   The string has static storage; the caller must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4100
Victor Stinner554f3f02010-06-16 23:33:54 +00004101/* create or adjust a UnicodeDecodeError */
4102static void
4103make_decode_exception(PyObject **exceptionObject,
4104 const char *encoding,
4105 const char *input, Py_ssize_t length,
4106 Py_ssize_t startpos, Py_ssize_t endpos,
4107 const char *reason)
4108{
4109 if (*exceptionObject == NULL) {
4110 *exceptionObject = PyUnicodeDecodeError_Create(
4111 encoding, input, length, startpos, endpos, reason);
4112 }
4113 else {
4114 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4115 goto onError;
4116 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4117 goto onError;
4118 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4119 goto onError;
4120 }
4121 return;
4122
4123onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004124 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004125}
4126
#ifdef HAVE_MBCS
/* Error handling callback helper for decoders that write into a
   wchar_t-based (PyUnicode_WCHAR_KIND) output object.

   Builds or updates the UnicodeDecodeError, calls the error handler,
   validates its (str, int) result, copies the replacement string to the
   output and adjusts the state variables.

   errors, errorHandler: handler name and cached handler object; the
       handler is looked up with PyCodec_LookupError() on first use.
   encoding, reason:     used to fill in the exception.
   input, inend:         start/end of the input; reloaded from the
       exception object because the callback may have replaced it.
   startinpos, endinpos: range of the undecodable input; *endinpos is set
       to the position returned by the handler.
   exceptionObject:      cached exception, created on first use.
   inptr:                set to the input position where decoding resumes.
   output, outpos:       output object (resized if needed) and the write
       position, which is advanced past the replacement.

   Returns 0 on success, -1 on error with an exception set. */

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "O!n" is the parse format; the text after ';' (starting at index 4)
       is the error message. */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Do not go on to read a non-bytes object as bytes: release our
           reference and report the error. */
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when doubling cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  A plain memory copy is used
       because wcsncpy() would stop at an embedded NUL in the replacement
       and zero-fill the remainder, dropping the characters after it. */
    Py_MEMCPY(_PyUnicode_WSTR(*output) + *outpos, repwstr,
              repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif /* HAVE_MBCS */
4242
4243static int
4244unicode_decode_call_errorhandler_writer(
4245 const char *errors, PyObject **errorHandler,
4246 const char *encoding, const char *reason,
4247 const char **input, const char **inend, Py_ssize_t *startinpos,
4248 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4249 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4250{
4251 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4252
4253 PyObject *restuple = NULL;
4254 PyObject *repunicode = NULL;
4255 Py_ssize_t insize;
4256 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004257 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004258 PyObject *inputobj = NULL;
4259
4260 if (*errorHandler == NULL) {
4261 *errorHandler = PyCodec_LookupError(errors);
4262 if (*errorHandler == NULL)
4263 goto onError;
4264 }
4265
4266 make_decode_exception(exceptionObject,
4267 encoding,
4268 *input, *inend - *input,
4269 *startinpos, *endinpos,
4270 reason);
4271 if (*exceptionObject == NULL)
4272 goto onError;
4273
4274 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4275 if (restuple == NULL)
4276 goto onError;
4277 if (!PyTuple_Check(restuple)) {
4278 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4279 goto onError;
4280 }
4281 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004282 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004283
4284 /* Copy back the bytes variables, which might have been modified by the
4285 callback */
4286 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4287 if (!inputobj)
4288 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004289 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004291 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004292 *input = PyBytes_AS_STRING(inputobj);
4293 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004294 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004295 /* we can DECREF safely, as the exception has another reference,
4296 so the object won't go away. */
4297 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004299 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004300 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004301 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004302 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004303 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004304 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305
Victor Stinner8f674cc2013-04-17 23:02:17 +02004306 if (PyUnicode_READY(repunicode) < 0)
4307 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004308 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004309 if (replen > 1) {
4310 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004311 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004312 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4313 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4314 goto onError;
4315 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004316 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004317 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004318
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004320 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004323 Py_XDECREF(restuple);
4324 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004325
Benjamin Peterson29060642009-01-31 22:14:21 +00004326 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004329}
4330
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004331/* --- UTF-7 Codec -------------------------------------------------------- */
4332
Antoine Pitrou244651a2009-05-04 18:56:13 +00004333/* See RFC2152 for details. We encode conservatively and decode liberally. */
4334
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  Each one may evaluate its argument more than once, so only
   pass side-effect free expressions. */

/* True if c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                    \
    (('A' <= (c) && (c) <= 'Z') ||      \
     ('a' <= (c) && (c) <= 'z') ||      \
     ('0' <= (c) && (c) <= '9') ||      \
     '+' == (c) || '/' == (c))

/* Map a base-64 character to its 6-bit value.  The caller must already
   know that c is a base-64 character; anything that is not a letter,
   digit or '+' is treated as '/'. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     '+' == (c) ? 62 : 63)

/* Map the low 6 bits of n to the corresponding base-64 character. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4357
/* DECODE_DIRECT: true if a byte found outside a shift sequence in UTF-7
 * input stands for itself.  Decoding is permissive: every ASCII byte
 * qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    (127 >= (c) && '+' != (c))
4365
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be a side-effect free expression (it is evaluated several
 * times).  directO and directWS are parenthesized in the expansion so
 * that compound expressions such as `a || b` keep their meaning.  The
 * range test on c runs first, so the table is never indexed out of
 * bounds. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004411
Alexander Belopolsky40018472011-02-26 01:02:56 +00004412PyObject *
4413PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004414 Py_ssize_t size,
4415 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004417 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4418}
4419
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420/* The decoder. The only state we preserve is our read position,
4421 * i.e. how many characters we have consumed. So if we end in the
4422 * middle of a shift sequence we have to back off the read position
4423 * and the output to the beginning of the sequence, otherwise we lose
4424 * all the shift state (seen bits, number of bits seen, high
4425 * surrogate). */
4426
/* Decode the `size` bytes starting at `s` as UTF-7.

   errors: name of the error handler, forwarded to
       unicode_decode_call_errorhandler_writer() on malformed input.
   consumed: if non-NULL, receives the number of input bytes decoded.  An
       input that ends inside a shift sequence is then not an error:
       *consumed is set to the offset of the opening '+' and the output
       produced since that '+' is discarded.  If NULL, ending in an
       inconsistent shift state is reported through the error handler.

   Returns the decoded string, or NULL with the writer released when a
   write or the error handler fails. */
PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                 /* non-zero while inside "+...-" */
    Py_ssize_t shiftOutStart;        /* writer.pos when the shift began */
    unsigned int base64bits = 0;     /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;  /* bits collected from base-64 characters */
    Py_UCS4 surrogate = 0;           /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Also entered via goto from the end-of-input error handling
           below, when the handler moved `s` back inside the input. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* No low surrogate followed: emit the pending
                               high surrogate on its own, then handle
                               outCh below. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is only written out when the
                   terminating byte decodes as itself; in every case it is
                   cleared afterwards. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the '+' is: it is the error start position
               and, in stateful mode, the position reported in *consumed
               when the shift sequence is incomplete. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Shared error path: the erroneous range is [startinpos, s).  The
           handler call may update starts, e and s. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler chose a resume position before the end of the
               input: continue decoding from there. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            /* Output was written since the shift began and the writer has
               seen a non-ASCII character: build the result directly from
               the first shiftOutStart characters instead of finishing the
               writer.  NOTE(review): presumably because writer.maxchar
               may reflect characters from the discarded tail - confirm. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4618
4619
Alexander Belopolsky40018472011-02-26 01:02:56 +00004620PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004621_PyUnicode_EncodeUTF7(PyObject *str,
4622 int base64SetO,
4623 int base64WhiteSpace,
4624 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004625{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004626 int kind;
4627 void *data;
4628 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004629 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004630 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004631 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004632 unsigned int base64bits = 0;
4633 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004634 char * out;
4635 char * start;
4636
Benjamin Petersonbac79492012-01-14 13:34:47 -05004637 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004638 return NULL;
4639 kind = PyUnicode_KIND(str);
4640 data = PyUnicode_DATA(str);
4641 len = PyUnicode_GET_LENGTH(str);
4642
4643 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004645
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004646 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004647 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004648 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004649 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004650 if (v == NULL)
4651 return NULL;
4652
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004653 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004654 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004655 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657 if (inShift) {
4658 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4659 /* shifting out */
4660 if (base64bits) { /* output remaining bits */
4661 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4662 base64buffer = 0;
4663 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004664 }
4665 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004666 /* Characters not in the BASE64 set implicitly unshift the sequence
4667 so no '-' is required, except if the character is itself a '-' */
4668 if (IS_BASE64(ch) || ch == '-') {
4669 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004670 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004671 *out++ = (char) ch;
4672 }
4673 else {
4674 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004675 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004676 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004677 else { /* not in a shift sequence */
4678 if (ch == '+') {
4679 *out++ = '+';
4680 *out++ = '-';
4681 }
4682 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4683 *out++ = (char) ch;
4684 }
4685 else {
4686 *out++ = '+';
4687 inShift = 1;
4688 goto encode_char;
4689 }
4690 }
4691 continue;
4692encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004693 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004694 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004695
Antoine Pitrou244651a2009-05-04 18:56:13 +00004696 /* code first surrogate */
4697 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004698 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004699 while (base64bits >= 6) {
4700 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4701 base64bits -= 6;
4702 }
4703 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004704 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004705 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004706 base64bits += 16;
4707 base64buffer = (base64buffer << 16) | ch;
4708 while (base64bits >= 6) {
4709 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4710 base64bits -= 6;
4711 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004712 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004713 if (base64bits)
4714 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4715 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004716 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004717 if (_PyBytes_Resize(&v, out - start) < 0)
4718 return NULL;
4719 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004720}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004721PyObject *
4722PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4723 Py_ssize_t size,
4724 int base64SetO,
4725 int base64WhiteSpace,
4726 const char *errors)
4727{
4728 PyObject *result;
4729 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4730 if (tmp == NULL)
4731 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004732 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004733 base64WhiteSpace, errors);
4734 Py_DECREF(tmp);
4735 return result;
4736}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004737
/* The UTF-7 helper macros are not needed past this point; undefine them
   so the names do not leak into the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004743
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744/* --- UTF-8 Codec -------------------------------------------------------- */
4745
Alexander Belopolsky40018472011-02-26 01:02:56 +00004746PyObject *
4747PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004748 Py_ssize_t size,
4749 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750{
Walter Dörwald69652032004-09-07 20:24:22 +00004751 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4752}
4753
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004754#include "stringlib/asciilib.h"
4755#include "stringlib/codecs.h"
4756#include "stringlib/undef.h"
4757
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004758#include "stringlib/ucs1lib.h"
4759#include "stringlib/codecs.h"
4760#include "stringlib/undef.h"
4761
4762#include "stringlib/ucs2lib.h"
4763#include "stringlib/codecs.h"
4764#include "stringlib/undef.h"
4765
4766#include "stringlib/ucs4lib.h"
4767#include "stringlib/codecs.h"
4768#include "stringlib/undef.h"
4769
Antoine Pitrouab868312009-01-10 15:40:25 +00004770/* Mask to quickly check whether a C 'long' contains a
4771 non-ASCII, UTF8-encoded char. */
4772#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004773# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004774#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004775# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004776#else
4777# error C 'long' size should be either 4 or 8!
4778#endif
4779
/* Copy the leading run of ASCII bytes (values < 0x80) from [start, end)
   to dest and return its length.  Scanning stops at the first byte with
   the high bit set or at end.  dest must have room for end - start
   bytes; the word-wise fast path additionally asserts that dest is
   aligned to SIZEOF_LONG. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* Word-sized reads are only performed below this bound (end aligned
       down to SIZEOF_LONG). */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Source and destination are both aligned: check and copy one
           'long' at a time while every byte in it is ASCII. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* Byte-wise tail: the rest of a word containing a non-ASCII byte,
           or the bytes past aligned_end. */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* Generic path: only scan here (word-wise whenever p is aligned) and
       copy the whole ASCII prefix with a single memcpy() afterwards. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            /* Reached the end exactly: nothing left to examine. */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
Antoine Pitrouab868312009-01-10 15:40:25 +00004843
Victor Stinner785938e2011-12-11 20:09:03 +01004844PyObject *
4845PyUnicode_DecodeUTF8Stateful(const char *s,
4846 Py_ssize_t size,
4847 const char *errors,
4848 Py_ssize_t *consumed)
4849{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004850 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004851 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004852 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004853
4854 Py_ssize_t startinpos;
4855 Py_ssize_t endinpos;
4856 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004857 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004858 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004859 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004860
4861 if (size == 0) {
4862 if (consumed)
4863 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004864 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004865 }
4866
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004867 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4868 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004869 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004870 *consumed = 1;
4871 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004872 }
4873
Victor Stinner8f674cc2013-04-17 23:02:17 +02004874 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004875 writer.min_length = size;
4876 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004877 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004878
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004879 writer.pos = ascii_decode(s, end, writer.data);
4880 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004881 while (s < end) {
4882 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004883 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004884
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004885 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004886 if (PyUnicode_IS_ASCII(writer.buffer))
4887 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004889 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004890 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004891 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004892 } else {
4893 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004894 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004895 }
4896
4897 switch (ch) {
4898 case 0:
4899 if (s == end || consumed)
4900 goto End;
4901 errmsg = "unexpected end of data";
4902 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004903 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004904 break;
4905 case 1:
4906 errmsg = "invalid start byte";
4907 startinpos = s - starts;
4908 endinpos = startinpos + 1;
4909 break;
4910 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004911 case 3:
4912 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004913 errmsg = "invalid continuation byte";
4914 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004915 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004916 break;
4917 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004918 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004919 goto onError;
4920 continue;
4921 }
4922
Victor Stinner1d65d912015-10-05 13:43:50 +02004923 if (error_handler == _Py_ERROR_UNKNOWN)
4924 error_handler = get_error_handler(errors);
4925
4926 switch (error_handler) {
4927 case _Py_ERROR_IGNORE:
4928 s += (endinpos - startinpos);
4929 break;
4930
4931 case _Py_ERROR_REPLACE:
4932 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4933 goto onError;
4934 s += (endinpos - startinpos);
4935 break;
4936
4937 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004938 {
4939 Py_ssize_t i;
4940
Victor Stinner1d65d912015-10-05 13:43:50 +02004941 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4942 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004943 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004944 ch = (Py_UCS4)(unsigned char)(starts[i]);
4945 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4946 ch + 0xdc00);
4947 writer.pos++;
4948 }
4949 s += (endinpos - startinpos);
4950 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004951 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004952
4953 default:
4954 if (unicode_decode_call_errorhandler_writer(
4955 errors, &error_handler_obj,
4956 "utf-8", errmsg,
4957 &starts, &end, &startinpos, &endinpos, &exc, &s,
4958 &writer))
4959 goto onError;
4960 }
Victor Stinner785938e2011-12-11 20:09:03 +01004961 }
4962
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004963End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004964 if (consumed)
4965 *consumed = s - starts;
4966
Victor Stinner1d65d912015-10-05 13:43:50 +02004967 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004968 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004969 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004970
4971onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004972 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004973 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004974 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004976}
4977
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004978#ifdef __APPLE__
4979
4980/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004981 used to decode the command line arguments on Mac OS X.
4982
4983 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004984 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004985
4986wchar_t*
4987_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4988{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004989 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004990 wchar_t *unicode;
4991 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004992
4993 /* Note: size will always be longer than the resulting Unicode
4994 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01004995 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004996 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004997 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004998 if (!unicode)
4999 return NULL;
5000
5001 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005002 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005004 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005006#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005008#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005010#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011 if (ch > 0xFF) {
5012#if SIZEOF_WCHAR_T == 4
5013 assert(0);
5014#else
5015 assert(Py_UNICODE_IS_SURROGATE(ch));
5016 /* compute and append the two surrogates: */
5017 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5018 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5019#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005020 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 else {
5022 if (!ch && s == e)
5023 break;
5024 /* surrogateescape */
5025 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5026 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005027 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005028 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005029 return unicode;
5030}
5031
5032#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005034/* Primary internal function which creates utf8 encoded bytes objects.
5035
5036 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005037 and allocate exactly as much space needed at the end. Else allocate the
5038 maximum possible needed (4 result bytes per Unicode character), and return
5039 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005040*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005041PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005042_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043{
Victor Stinner6099a032011-12-18 14:22:26 +01005044 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005045 void *data;
5046 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005048 if (!PyUnicode_Check(unicode)) {
5049 PyErr_BadArgument();
5050 return NULL;
5051 }
5052
5053 if (PyUnicode_READY(unicode) == -1)
5054 return NULL;
5055
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005056 if (PyUnicode_UTF8(unicode))
5057 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5058 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005059
5060 kind = PyUnicode_KIND(unicode);
5061 data = PyUnicode_DATA(unicode);
5062 size = PyUnicode_GET_LENGTH(unicode);
5063
Benjamin Petersonead6b532011-12-20 17:23:42 -06005064 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005065 default:
5066 assert(0);
5067 case PyUnicode_1BYTE_KIND:
5068 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5069 assert(!PyUnicode_IS_ASCII(unicode));
5070 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5071 case PyUnicode_2BYTE_KIND:
5072 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5073 case PyUnicode_4BYTE_KIND:
5074 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076}
5077
Alexander Belopolsky40018472011-02-26 01:02:56 +00005078PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005079PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5080 Py_ssize_t size,
5081 const char *errors)
5082{
5083 PyObject *v, *unicode;
5084
5085 unicode = PyUnicode_FromUnicode(s, size);
5086 if (unicode == NULL)
5087 return NULL;
5088 v = _PyUnicode_AsUTF8String(unicode, errors);
5089 Py_DECREF(unicode);
5090 return v;
5091}
5092
5093PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005094PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005096 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097}
5098
Walter Dörwald41980ca2007-08-16 21:55:45 +00005099/* --- UTF-32 Codec ------------------------------------------------------- */
5100
5101PyObject *
5102PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005103 Py_ssize_t size,
5104 const char *errors,
5105 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005106{
5107 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5108}
5109
5110PyObject *
5111PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005112 Py_ssize_t size,
5113 const char *errors,
5114 int *byteorder,
5115 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005116{
5117 const char *starts = s;
5118 Py_ssize_t startinpos;
5119 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005120 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005121 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005122 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005123 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005124 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005125 PyObject *errorHandler = NULL;
5126 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005127
Walter Dörwald41980ca2007-08-16 21:55:45 +00005128 q = (unsigned char *)s;
5129 e = q + size;
5130
5131 if (byteorder)
5132 bo = *byteorder;
5133
5134 /* Check for BOM marks (U+FEFF) in the input and adjust current
5135 byte order setting accordingly. In native mode, the leading BOM
5136 mark is skipped, in all other modes, it is copied to the output
5137 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005138 if (bo == 0 && size >= 4) {
5139 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5140 if (bom == 0x0000FEFF) {
5141 bo = -1;
5142 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005143 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005144 else if (bom == 0xFFFE0000) {
5145 bo = 1;
5146 q += 4;
5147 }
5148 if (byteorder)
5149 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005150 }
5151
Victor Stinnere64322e2012-10-30 23:12:47 +01005152 if (q == e) {
5153 if (consumed)
5154 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005155 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005156 }
5157
Victor Stinnere64322e2012-10-30 23:12:47 +01005158#ifdef WORDS_BIGENDIAN
5159 le = bo < 0;
5160#else
5161 le = bo <= 0;
5162#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005163 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005164
Victor Stinner8f674cc2013-04-17 23:02:17 +02005165 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005166 writer.min_length = (e - q + 3) / 4;
5167 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005168 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005169
Victor Stinnere64322e2012-10-30 23:12:47 +01005170 while (1) {
5171 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005172 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005173
Victor Stinnere64322e2012-10-30 23:12:47 +01005174 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005175 enum PyUnicode_Kind kind = writer.kind;
5176 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005177 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005178 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005179 if (le) {
5180 do {
5181 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5182 if (ch > maxch)
5183 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005184 if (kind != PyUnicode_1BYTE_KIND &&
5185 Py_UNICODE_IS_SURROGATE(ch))
5186 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005187 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005188 q += 4;
5189 } while (q <= last);
5190 }
5191 else {
5192 do {
5193 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5194 if (ch > maxch)
5195 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005196 if (kind != PyUnicode_1BYTE_KIND &&
5197 Py_UNICODE_IS_SURROGATE(ch))
5198 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005199 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005200 q += 4;
5201 } while (q <= last);
5202 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005203 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005204 }
5205
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005206 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005207 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005208 startinpos = ((const char *)q) - starts;
5209 endinpos = startinpos + 4;
5210 }
5211 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005212 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005213 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005214 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005215 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005216 startinpos = ((const char *)q) - starts;
5217 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005219 else {
5220 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005221 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005222 goto onError;
5223 q += 4;
5224 continue;
5225 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005226 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005227 startinpos = ((const char *)q) - starts;
5228 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005230
5231 /* The remaining input chars are ignored if the callback
5232 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005233 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005234 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005235 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005237 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005238 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005239 }
5240
Walter Dörwald41980ca2007-08-16 21:55:45 +00005241 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005243
Walter Dörwald41980ca2007-08-16 21:55:45 +00005244 Py_XDECREF(errorHandler);
5245 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005246 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005247
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005249 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005250 Py_XDECREF(errorHandler);
5251 Py_XDECREF(exc);
5252 return NULL;
5253}
5254
5255PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005256_PyUnicode_EncodeUTF32(PyObject *str,
5257 const char *errors,
5258 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005259{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005260 enum PyUnicode_Kind kind;
5261 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005262 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005263 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005264 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005265#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005266 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005267#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005268 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005269#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005270 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005271 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005272 PyObject *errorHandler = NULL;
5273 PyObject *exc = NULL;
5274 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005275
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005276 if (!PyUnicode_Check(str)) {
5277 PyErr_BadArgument();
5278 return NULL;
5279 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005280 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005281 return NULL;
5282 kind = PyUnicode_KIND(str);
5283 data = PyUnicode_DATA(str);
5284 len = PyUnicode_GET_LENGTH(str);
5285
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005286 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005287 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005288 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005289 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005290 if (v == NULL)
5291 return NULL;
5292
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005293 /* output buffer is 4-bytes aligned */
5294 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5295 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005296 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005297 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005298 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005299 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005300
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005301 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005302 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005303 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005304 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005305 else
5306 encoding = "utf-32";
5307
5308 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005309 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5310 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005311 }
5312
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005313 pos = 0;
5314 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005315 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005316
5317 if (kind == PyUnicode_2BYTE_KIND) {
5318 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5319 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005320 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005321 else {
5322 assert(kind == PyUnicode_4BYTE_KIND);
5323 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5324 &out, native_ordering);
5325 }
5326 if (pos == len)
5327 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005328
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005329 rep = unicode_encode_call_errorhandler(
5330 errors, &errorHandler,
5331 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005332 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005333 if (!rep)
5334 goto error;
5335
5336 if (PyBytes_Check(rep)) {
5337 repsize = PyBytes_GET_SIZE(rep);
5338 if (repsize & 3) {
5339 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005340 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005341 "surrogates not allowed");
5342 goto error;
5343 }
5344 moreunits = repsize / 4;
5345 }
5346 else {
5347 assert(PyUnicode_Check(rep));
5348 if (PyUnicode_READY(rep) < 0)
5349 goto error;
5350 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5351 if (!PyUnicode_IS_ASCII(rep)) {
5352 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005353 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005354 "surrogates not allowed");
5355 goto error;
5356 }
5357 }
5358
5359 /* four bytes are reserved for each surrogate */
5360 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005361 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005362 Py_ssize_t morebytes = 4 * (moreunits - 1);
5363 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5364 /* integer overflow */
5365 PyErr_NoMemory();
5366 goto error;
5367 }
5368 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5369 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005370 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005371 }
5372
5373 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005374 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5375 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005376 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005377 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005378 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5379 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005380 }
5381
5382 Py_CLEAR(rep);
5383 }
5384
5385 /* Cut back to size actually needed. This is necessary for, for example,
5386 encoding of a string containing isolated surrogates and the 'ignore'
5387 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005388 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005389 if (nsize != PyBytes_GET_SIZE(v))
5390 _PyBytes_Resize(&v, nsize);
5391 Py_XDECREF(errorHandler);
5392 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005393 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005394 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005395 error:
5396 Py_XDECREF(rep);
5397 Py_XDECREF(errorHandler);
5398 Py_XDECREF(exc);
5399 Py_XDECREF(v);
5400 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005401}
5402
Alexander Belopolsky40018472011-02-26 01:02:56 +00005403PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005404PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5405 Py_ssize_t size,
5406 const char *errors,
5407 int byteorder)
5408{
5409 PyObject *result;
5410 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5411 if (tmp == NULL)
5412 return NULL;
5413 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5414 Py_DECREF(tmp);
5415 return result;
5416}
5417
5418PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005419PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005420{
Victor Stinnerb960b342011-11-20 19:12:52 +01005421 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005422}
5423
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424/* --- UTF-16 Codec ------------------------------------------------------- */
5425
Tim Peters772747b2001-08-09 22:21:55 +00005426PyObject *
5427PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 Py_ssize_t size,
5429 const char *errors,
5430 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431{
Walter Dörwald69652032004-09-07 20:24:22 +00005432 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5433}
5434
5435PyObject *
5436PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005437 Py_ssize_t size,
5438 const char *errors,
5439 int *byteorder,
5440 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005441{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005442 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005443 Py_ssize_t startinpos;
5444 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005445 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005446 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005447 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005448 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005449 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005450 PyObject *errorHandler = NULL;
5451 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005452 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453
Tim Peters772747b2001-08-09 22:21:55 +00005454 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005455 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456
5457 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005458 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005460 /* Check for BOM marks (U+FEFF) in the input and adjust current
5461 byte order setting accordingly. In native mode, the leading BOM
5462 mark is skipped, in all other modes, it is copied to the output
5463 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005464 if (bo == 0 && size >= 2) {
5465 const Py_UCS4 bom = (q[1] << 8) | q[0];
5466 if (bom == 0xFEFF) {
5467 q += 2;
5468 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005469 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005470 else if (bom == 0xFFFE) {
5471 q += 2;
5472 bo = 1;
5473 }
5474 if (byteorder)
5475 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477
Antoine Pitrou63065d72012-05-15 23:48:04 +02005478 if (q == e) {
5479 if (consumed)
5480 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005481 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005482 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005483
Christian Heimes743e0cd2012-10-17 23:52:17 +02005484#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005485 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005486 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005487#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005488 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005489 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005490#endif
Tim Peters772747b2001-08-09 22:21:55 +00005491
Antoine Pitrou63065d72012-05-15 23:48:04 +02005492 /* Note: size will always be longer than the resulting Unicode
5493 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005494 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005495 writer.min_length = (e - q + 1) / 2;
5496 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005497 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005498
Antoine Pitrou63065d72012-05-15 23:48:04 +02005499 while (1) {
5500 Py_UCS4 ch = 0;
5501 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005502 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005503 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005504 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005505 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005506 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005507 native_ordering);
5508 else
5509 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005510 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005511 native_ordering);
5512 } else if (kind == PyUnicode_2BYTE_KIND) {
5513 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005514 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005515 native_ordering);
5516 } else {
5517 assert(kind == PyUnicode_4BYTE_KIND);
5518 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005519 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005520 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005521 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005522 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005523
Antoine Pitrou63065d72012-05-15 23:48:04 +02005524 switch (ch)
5525 {
5526 case 0:
5527 /* remaining byte at the end? (size should be even) */
5528 if (q == e || consumed)
5529 goto End;
5530 errmsg = "truncated data";
5531 startinpos = ((const char *)q) - starts;
5532 endinpos = ((const char *)e) - starts;
5533 break;
5534 /* The remaining input chars are ignored if the callback
5535 chooses to skip the input */
5536 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005537 q -= 2;
5538 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005539 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005540 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005541 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005542 endinpos = ((const char *)e) - starts;
5543 break;
5544 case 2:
5545 errmsg = "illegal encoding";
5546 startinpos = ((const char *)q) - 2 - starts;
5547 endinpos = startinpos + 2;
5548 break;
5549 case 3:
5550 errmsg = "illegal UTF-16 surrogate";
5551 startinpos = ((const char *)q) - 4 - starts;
5552 endinpos = startinpos + 2;
5553 break;
5554 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005555 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005556 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 continue;
5558 }
5559
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005560 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005561 errors,
5562 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005563 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005564 &starts,
5565 (const char **)&e,
5566 &startinpos,
5567 &endinpos,
5568 &exc,
5569 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005570 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005571 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 }
5573
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574End:
Walter Dörwald69652032004-09-07 20:24:22 +00005575 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005577
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005578 Py_XDECREF(errorHandler);
5579 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005580 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581
Benjamin Peterson29060642009-01-31 22:14:21 +00005582 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005583 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005584 Py_XDECREF(errorHandler);
5585 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 return NULL;
5587}
5588
Tim Peters772747b2001-08-09 22:21:55 +00005589PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005590_PyUnicode_EncodeUTF16(PyObject *str,
5591 const char *errors,
5592 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005594 enum PyUnicode_Kind kind;
5595 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005596 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005597 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005598 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005599 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005600#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005601 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005602#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005603 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005604#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005605 const char *encoding;
5606 Py_ssize_t nsize, pos;
5607 PyObject *errorHandler = NULL;
5608 PyObject *exc = NULL;
5609 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005610
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005611 if (!PyUnicode_Check(str)) {
5612 PyErr_BadArgument();
5613 return NULL;
5614 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005615 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005616 return NULL;
5617 kind = PyUnicode_KIND(str);
5618 data = PyUnicode_DATA(str);
5619 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005620
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005621 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005622 if (kind == PyUnicode_4BYTE_KIND) {
5623 const Py_UCS4 *in = (const Py_UCS4 *)data;
5624 const Py_UCS4 *end = in + len;
5625 while (in < end)
5626 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005627 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005628 }
5629 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005631 nsize = len + pairs + (byteorder == 0);
5632 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 if (v == NULL)
5634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005636 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005637 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005638 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005640 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005641 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005642 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005643
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005644 if (kind == PyUnicode_1BYTE_KIND) {
5645 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5646 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005647 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005648
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005649 if (byteorder < 0)
5650 encoding = "utf-16-le";
5651 else if (byteorder > 0)
5652 encoding = "utf-16-be";
5653 else
5654 encoding = "utf-16";
5655
5656 pos = 0;
5657 while (pos < len) {
5658 Py_ssize_t repsize, moreunits;
5659
5660 if (kind == PyUnicode_2BYTE_KIND) {
5661 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5662 &out, native_ordering);
5663 }
5664 else {
5665 assert(kind == PyUnicode_4BYTE_KIND);
5666 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5667 &out, native_ordering);
5668 }
5669 if (pos == len)
5670 break;
5671
5672 rep = unicode_encode_call_errorhandler(
5673 errors, &errorHandler,
5674 encoding, "surrogates not allowed",
5675 str, &exc, pos, pos + 1, &pos);
5676 if (!rep)
5677 goto error;
5678
5679 if (PyBytes_Check(rep)) {
5680 repsize = PyBytes_GET_SIZE(rep);
5681 if (repsize & 1) {
5682 raise_encode_exception(&exc, encoding,
5683 str, pos - 1, pos,
5684 "surrogates not allowed");
5685 goto error;
5686 }
5687 moreunits = repsize / 2;
5688 }
5689 else {
5690 assert(PyUnicode_Check(rep));
5691 if (PyUnicode_READY(rep) < 0)
5692 goto error;
5693 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5694 if (!PyUnicode_IS_ASCII(rep)) {
5695 raise_encode_exception(&exc, encoding,
5696 str, pos - 1, pos,
5697 "surrogates not allowed");
5698 goto error;
5699 }
5700 }
5701
5702 /* two bytes are reserved for each surrogate */
5703 if (moreunits > 1) {
5704 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5705 Py_ssize_t morebytes = 2 * (moreunits - 1);
5706 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5707 /* integer overflow */
5708 PyErr_NoMemory();
5709 goto error;
5710 }
5711 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5712 goto error;
5713 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5714 }
5715
5716 if (PyBytes_Check(rep)) {
5717 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5718 out += moreunits;
5719 } else /* rep is unicode */ {
5720 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5721 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5722 &out, native_ordering);
5723 }
5724
5725 Py_CLEAR(rep);
5726 }
5727
5728 /* Cut back to size actually needed. This is necessary for, for example,
5729 encoding of a string containing isolated surrogates and the 'ignore' handler
5730 is used. */
5731 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5732 if (nsize != PyBytes_GET_SIZE(v))
5733 _PyBytes_Resize(&v, nsize);
5734 Py_XDECREF(errorHandler);
5735 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005736 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005737 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005738 error:
5739 Py_XDECREF(rep);
5740 Py_XDECREF(errorHandler);
5741 Py_XDECREF(exc);
5742 Py_XDECREF(v);
5743 return NULL;
5744#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745}
5746
Alexander Belopolsky40018472011-02-26 01:02:56 +00005747PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005748PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5749 Py_ssize_t size,
5750 const char *errors,
5751 int byteorder)
5752{
5753 PyObject *result;
5754 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5755 if (tmp == NULL)
5756 return NULL;
5757 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5758 Py_DECREF(tmp);
5759 return result;
5760}
5761
5762PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005763PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005765 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766}
5767
5768/* --- Unicode Escape Codec ----------------------------------------------- */
5769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005770/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5771 if all the escapes in the string make it still a valid ASCII string.
5772 Returns -1 if any escapes were found which cause the string to
5773 pop out of ASCII range. Otherwise returns the length of the
5774 required buffer to hold the string.
5775 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005776static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005777length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5778{
5779 const unsigned char *p = (const unsigned char *)s;
5780 const unsigned char *end = p + size;
5781 Py_ssize_t length = 0;
5782
5783 if (size < 0)
5784 return -1;
5785
5786 for (; p < end; ++p) {
5787 if (*p > 127) {
5788 /* Non-ASCII */
5789 return -1;
5790 }
5791 else if (*p != '\\') {
5792 /* Normal character */
5793 ++length;
5794 }
5795 else {
5796 /* Backslash-escape, check next char */
5797 ++p;
5798 /* Escape sequence reaches till end of string or
5799 non-ASCII follow-up. */
5800 if (p >= end || *p > 127)
5801 return -1;
5802 switch (*p) {
5803 case '\n':
5804 /* backslash + \n result in zero characters */
5805 break;
5806 case '\\': case '\'': case '\"':
5807 case 'b': case 'f': case 't':
5808 case 'n': case 'r': case 'v': case 'a':
5809 ++length;
5810 break;
5811 case '0': case '1': case '2': case '3':
5812 case '4': case '5': case '6': case '7':
5813 case 'x': case 'u': case 'U': case 'N':
5814 /* these do not guarantee ASCII characters */
5815 return -1;
5816 default:
5817 /* count the backslash + the other character */
5818 length += 2;
5819 }
5820 }
5821 }
5822 return length;
5823}
5824
Fredrik Lundh06d12682001-01-24 07:59:11 +00005825static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005826
Alexander Belopolsky40018472011-02-26 01:02:56 +00005827PyObject *
5828PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005829 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005830 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005832 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005833 Py_ssize_t startinpos;
5834 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005835 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005837 char* message;
5838 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 PyObject *errorHandler = NULL;
5840 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005841 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005842
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005843 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005844 if (len == 0)
5845 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005846
5847 /* After length_of_escaped_ascii_string() there are two alternatives,
5848 either the string is pure ASCII with named escapes like \n, etc.
5849 and we determined it's exact size (common case)
5850 or it contains \x, \u, ... escape sequences. then we create a
5851 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005852 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005853 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005854 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005855 }
5856 else {
5857 /* Escaped strings will always be longer than the resulting
5858 Unicode string, so we start with size here and then reduce the
5859 length after conversion to the true value.
5860 (but if the error callback returns a long replacement string
5861 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005862 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005863 }
5864
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005866 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005868
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 while (s < end) {
5870 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005871 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005872 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
5874 /* Non-escape characters are interpreted as Unicode ordinals */
5875 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005876 x = (unsigned char)*s;
5877 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005878 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005879 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 continue;
5881 }
5882
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 /* \ - Escapes */
5885 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005886 c = *s++;
5887 if (s > end)
5888 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005889
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005890 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005893#define WRITECHAR(ch) \
5894 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005895 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005896 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005897 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005898
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005900 case '\\': WRITECHAR('\\'); break;
5901 case '\'': WRITECHAR('\''); break;
5902 case '\"': WRITECHAR('\"'); break;
5903 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005904 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005905 case 'f': WRITECHAR('\014'); break;
5906 case 't': WRITECHAR('\t'); break;
5907 case 'n': WRITECHAR('\n'); break;
5908 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005909 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005910 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005911 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005912 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 case '0': case '1': case '2': case '3':
5916 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005917 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005918 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005919 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005920 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005921 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005923 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 break;
5925
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 /* hex escapes */
5927 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005929 digits = 2;
5930 message = "truncated \\xXX escape";
5931 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005935 digits = 4;
5936 message = "truncated \\uXXXX escape";
5937 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005940 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005941 digits = 8;
5942 message = "truncated \\UXXXXXXXX escape";
5943 hexescape:
5944 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005945 if (end - s < digits) {
5946 /* count only hex digits */
5947 for (; s < end; ++s) {
5948 c = (unsigned char)*s;
5949 if (!Py_ISXDIGIT(c))
5950 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005951 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005952 goto error;
5953 }
5954 for (; digits--; ++s) {
5955 c = (unsigned char)*s;
5956 if (!Py_ISXDIGIT(c))
5957 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005958 chr = (chr<<4) & ~0xF;
5959 if (c >= '0' && c <= '9')
5960 chr += c - '0';
5961 else if (c >= 'a' && c <= 'f')
5962 chr += 10 + c - 'a';
5963 else
5964 chr += 10 + c - 'A';
5965 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005966 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967 /* _decoding_error will have already written into the
5968 target buffer. */
5969 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005970 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005971 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005972 message = "illegal Unicode character";
5973 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005974 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005975 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005976 break;
5977
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005979 case 'N':
5980 message = "malformed \\N character escape";
5981 if (ucnhash_CAPI == NULL) {
5982 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005983 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5984 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005985 if (ucnhash_CAPI == NULL)
5986 goto ucnhashError;
5987 }
5988 if (*s == '{') {
5989 const char *start = s+1;
5990 /* look for the closing brace */
5991 while (*s != '}' && s < end)
5992 s++;
5993 if (s > start && s < end && *s == '}') {
5994 /* found a name. look it up in the unicode database */
5995 message = "unknown Unicode character name";
5996 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005997 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005998 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005999 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006000 goto store;
6001 }
6002 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006003 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006004
6005 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006006 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007 message = "\\ at end of string";
6008 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006009 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00006010 }
6011 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006012 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02006013 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006014 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006015 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006017 continue;
6018
6019 error:
6020 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006021 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006022 errors, &errorHandler,
6023 "unicodeescape", message,
6024 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006025 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02006026 goto onError;
6027 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006029#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006030
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006031 Py_XDECREF(errorHandler);
6032 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006033 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006034
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006036 PyErr_SetString(
6037 PyExc_UnicodeError,
6038 "\\N escapes not supported (can't load unicodedata module)"
6039 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006040 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006041 Py_XDECREF(errorHandler);
6042 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006043 return NULL;
6044
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006046 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006047 Py_XDECREF(errorHandler);
6048 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 return NULL;
6050}
6051
6052/* Return a Unicode-Escape string version of the Unicode object.
6053
6054 If quotes is true, the string is enclosed in u"" or u'' quotes as
6055 appropriate.
6056
6057*/
6058
Alexander Belopolsky40018472011-02-26 01:02:56 +00006059PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006060PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006062 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006063 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006065 int kind;
6066 void *data;
6067 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068
Ezio Melottie7f90372012-10-05 03:33:31 +03006069 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006070 escape.
6071
Ezio Melottie7f90372012-10-05 03:33:31 +03006072 For UCS1 strings it's '\xxx', 4 bytes per source character.
6073 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6074 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006075 */
6076
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006077 if (!PyUnicode_Check(unicode)) {
6078 PyErr_BadArgument();
6079 return NULL;
6080 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006081 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006082 return NULL;
6083 len = PyUnicode_GET_LENGTH(unicode);
6084 kind = PyUnicode_KIND(unicode);
6085 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006086 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006087 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6088 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6089 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6090 }
6091
6092 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006093 return PyBytes_FromStringAndSize(NULL, 0);
6094
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006095 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006097
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006098 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006100 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 if (repr == NULL)
6103 return NULL;
6104
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006105 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006107 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006108 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006109
Walter Dörwald79e913e2007-05-12 11:08:06 +00006110 /* Escape backslashes */
6111 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 *p++ = '\\';
6113 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006114 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006115 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006116
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006117 /* Map 21-bit characters to '\U00xxxxxx' */
6118 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006119 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006120 *p++ = '\\';
6121 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006122 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6123 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6124 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6125 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6126 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6127 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6128 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6129 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006131 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006132
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006134 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 *p++ = '\\';
6136 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006137 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6138 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6139 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6140 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006142
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006143 /* Map special whitespace to '\t', \n', '\r' */
6144 else if (ch == '\t') {
6145 *p++ = '\\';
6146 *p++ = 't';
6147 }
6148 else if (ch == '\n') {
6149 *p++ = '\\';
6150 *p++ = 'n';
6151 }
6152 else if (ch == '\r') {
6153 *p++ = '\\';
6154 *p++ = 'r';
6155 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006156
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006157 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006158 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006160 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006161 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6162 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006163 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006164
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 /* Copy everything else as-is */
6166 else
6167 *p++ = (char) ch;
6168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006170 assert(p - PyBytes_AS_STRING(repr) > 0);
6171 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6172 return NULL;
6173 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174}
6175
Alexander Belopolsky40018472011-02-26 01:02:56 +00006176PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6178 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 PyObject *result;
6181 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6182 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006184 result = PyUnicode_AsUnicodeEscapeString(tmp);
6185 Py_DECREF(tmp);
6186 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187}
6188
6189/* --- Raw Unicode Escape Codec ------------------------------------------- */
6190
Alexander Belopolsky40018472011-02-26 01:02:56 +00006191PyObject *
6192PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006193 Py_ssize_t size,
6194 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006196 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006197 Py_ssize_t startinpos;
6198 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006199 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 const char *end;
6201 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006202 PyObject *errorHandler = NULL;
6203 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006204
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006205 if (size == 0)
6206 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006207
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 /* Escaped strings will always be longer than the resulting
6209 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006210 length after conversion to the true value. (But decoding error
6211 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006212 _PyUnicodeWriter_Init(&writer);
6213 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006214
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 end = s + size;
6216 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 unsigned char c;
6218 Py_UCS4 x;
6219 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006220 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221
Benjamin Peterson29060642009-01-31 22:14:21 +00006222 /* Non-escape characters are interpreted as Unicode ordinals */
6223 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006224 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006225 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006226 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006228 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 startinpos = s-starts;
6230
6231 /* \u-escapes are only interpreted iff the number of leading
6232 backslashes if odd */
6233 bs = s;
6234 for (;s < end;) {
6235 if (*s != '\\')
6236 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006237 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006238 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006239 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 }
6241 if (((s - bs) & 1) == 0 ||
6242 s >= end ||
6243 (*s != 'u' && *s != 'U')) {
6244 continue;
6245 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006246 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 count = *s=='u' ? 4 : 8;
6248 s++;
6249
6250 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 for (x = 0, i = 0; i < count; ++i, ++s) {
6252 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006253 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006255 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 errors, &errorHandler,
6257 "rawunicodeescape", "truncated \\uXXXX",
6258 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006259 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 goto onError;
6261 goto nextByte;
6262 }
6263 x = (x<<4) & ~0xF;
6264 if (c >= '0' && c <= '9')
6265 x += c - '0';
6266 else if (c >= 'a' && c <= 'f')
6267 x += 10 + c - 'a';
6268 else
6269 x += 10 + c - 'A';
6270 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006271 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006272 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006273 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006274 }
6275 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006276 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006277 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006278 errors, &errorHandler,
6279 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006281 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006283 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 nextByte:
6285 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006287 Py_XDECREF(errorHandler);
6288 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006289 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006290
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006292 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293 Py_XDECREF(errorHandler);
6294 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 return NULL;
6296}
6297
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006298
Alexander Belopolsky40018472011-02-26 01:02:56 +00006299PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006300PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006302 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 char *p;
6304 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006305 Py_ssize_t expandsize, pos;
6306 int kind;
6307 void *data;
6308 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006310 if (!PyUnicode_Check(unicode)) {
6311 PyErr_BadArgument();
6312 return NULL;
6313 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006314 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006315 return NULL;
6316 kind = PyUnicode_KIND(unicode);
6317 data = PyUnicode_DATA(unicode);
6318 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006319 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6320 bytes, and 1 byte characters 4. */
6321 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006322
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006323 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006325
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006326 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 if (repr == NULL)
6328 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006329 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006330 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006332 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006333 for (pos = 0; pos < len; pos++) {
6334 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 /* Map 32-bit characters to '\Uxxxxxxxx' */
6336 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006337 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006338 *p++ = '\\';
6339 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006340 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6341 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6342 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6343 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6344 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6345 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6346 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6347 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006348 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006350 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 *p++ = '\\';
6352 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006353 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6354 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6355 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6356 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 /* Copy everything else as-is */
6359 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360 *p++ = (char) ch;
6361 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006362
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006363 assert(p > q);
6364 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006365 return NULL;
6366 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367}
6368
Alexander Belopolsky40018472011-02-26 01:02:56 +00006369PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006370PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6371 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006373 PyObject *result;
6374 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6375 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006376 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006377 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6378 Py_DECREF(tmp);
6379 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380}
6381
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006382/* --- Unicode Internal Codec ------------------------------------------- */
6383
Alexander Belopolsky40018472011-02-26 01:02:56 +00006384PyObject *
6385_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006386 Py_ssize_t size,
6387 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006388{
6389 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006390 Py_ssize_t startinpos;
6391 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006392 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006393 const char *end;
6394 const char *reason;
6395 PyObject *errorHandler = NULL;
6396 PyObject *exc = NULL;
6397
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006398 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006399 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006400 1))
6401 return NULL;
6402
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006403 if (size == 0)
6404 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006405
Victor Stinner8f674cc2013-04-17 23:02:17 +02006406 _PyUnicodeWriter_Init(&writer);
6407 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6408 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006410 }
6411 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006412
Victor Stinner8f674cc2013-04-17 23:02:17 +02006413 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006414 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006415 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006416 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006417 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006418 endinpos = end-starts;
6419 reason = "truncated input";
6420 goto error;
6421 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006422 /* We copy the raw representation one byte at a time because the
6423 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006424 ((char *) &uch)[0] = s[0];
6425 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006426#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006427 ((char *) &uch)[2] = s[2];
6428 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006429#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006430 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006431#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006432 /* We have to sanity check the raw data, otherwise doom looms for
6433 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006434 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006435 endinpos = s - starts + Py_UNICODE_SIZE;
6436 reason = "illegal code point (> 0x10FFFF)";
6437 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006438 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006439#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006440 s += Py_UNICODE_SIZE;
6441#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006442 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006443 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006444 Py_UNICODE uch2;
6445 ((char *) &uch2)[0] = s[0];
6446 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006447 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006448 {
Victor Stinner551ac952011-11-29 22:58:13 +01006449 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006450 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006451 }
6452 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006453#endif
6454
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006455 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006456 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006457 continue;
6458
6459 error:
6460 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006461 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006462 errors, &errorHandler,
6463 "unicode_internal", reason,
6464 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006465 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006466 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006467 }
6468
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006469 Py_XDECREF(errorHandler);
6470 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006471 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006472
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006474 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006475 Py_XDECREF(errorHandler);
6476 Py_XDECREF(exc);
6477 return NULL;
6478}
6479
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480/* --- Latin-1 Codec ------------------------------------------------------ */
6481
Alexander Belopolsky40018472011-02-26 01:02:56 +00006482PyObject *
6483PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006484 Py_ssize_t size,
6485 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006488 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489}
6490
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006492static void
6493make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006494 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006495 PyObject *unicode,
6496 Py_ssize_t startpos, Py_ssize_t endpos,
6497 const char *reason)
6498{
6499 if (*exceptionObject == NULL) {
6500 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006502 encoding, unicode, startpos, endpos, reason);
6503 }
6504 else {
6505 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6506 goto onError;
6507 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6508 goto onError;
6509 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6510 goto onError;
6511 return;
6512 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006513 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006514 }
6515}
6516
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006517/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006518static void
6519raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006520 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006521 PyObject *unicode,
6522 Py_ssize_t startpos, Py_ssize_t endpos,
6523 const char *reason)
6524{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006525 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006526 encoding, unicode, startpos, endpos, reason);
6527 if (*exceptionObject != NULL)
6528 PyCodec_StrictErrors(*exceptionObject);
6529}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006530
6531/* error handling callback helper:
6532 build arguments, call the callback and check the arguments,
6533 put the result into newpos and return the replacement string, which
6534 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006535static PyObject *
6536unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006537 PyObject **errorHandler,
6538 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006539 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006540 Py_ssize_t startpos, Py_ssize_t endpos,
6541 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006542{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006543 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006544 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006545 PyObject *restuple;
6546 PyObject *resunicode;
6547
6548 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006552 }
6553
Benjamin Petersonbac79492012-01-14 13:34:47 -05006554 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006555 return NULL;
6556 len = PyUnicode_GET_LENGTH(unicode);
6557
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006558 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006559 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006560 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562
6563 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006565 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006567 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006568 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 Py_DECREF(restuple);
6570 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006571 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006572 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 &resunicode, newpos)) {
6574 Py_DECREF(restuple);
6575 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006576 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006577 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6578 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6579 Py_DECREF(restuple);
6580 return NULL;
6581 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006582 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006583 *newpos = len + *newpos;
6584 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006585 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 Py_DECREF(restuple);
6587 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006588 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006589 Py_INCREF(resunicode);
6590 Py_DECREF(restuple);
6591 return resunicode;
6592}
6593
Alexander Belopolsky40018472011-02-26 01:02:56 +00006594static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006595unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006596 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006597 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006598{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006599 /* input state */
6600 Py_ssize_t pos=0, size;
6601 int kind;
6602 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006603 /* pointer into the output */
6604 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006605 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6606 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006607 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006609 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006610 /* output object */
6611 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006612
Benjamin Petersonbac79492012-01-14 13:34:47 -05006613 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006614 return NULL;
6615 size = PyUnicode_GET_LENGTH(unicode);
6616 kind = PyUnicode_KIND(unicode);
6617 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006618 /* allocate enough for a simple encoding without
6619 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006620 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006621 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006622
6623 _PyBytesWriter_Init(&writer);
6624 str = _PyBytesWriter_Alloc(&writer, size);
6625 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006626 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006627
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006628 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006629 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006630
Benjamin Peterson29060642009-01-31 22:14:21 +00006631 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006632 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006634 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006635 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006636 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006637 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 PyObject *repunicode;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006639 Py_ssize_t repsize, newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006640 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006641 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006642 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006644
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006645 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006647
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006648 /* Only overallocate the buffer if it's not the last write */
6649 writer.overallocate = (collend < size);
6650
Benjamin Peterson29060642009-01-31 22:14:21 +00006651 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006652 if (error_handler == _Py_ERROR_UNKNOWN)
6653 error_handler = get_error_handler(errors);
6654
6655 switch (error_handler) {
6656 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006657 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006659
6660 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006661 memset(str, '?', collend - collstart);
6662 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006663 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006664 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006665 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 break;
Victor Stinner50149202015-09-22 00:26:54 +02006667
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006668 case _Py_ERROR_BACKSLASHREPLACE:
6669 str = backslashreplace(&writer, 1, str,
6670 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006671 if (str == NULL)
6672 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006673 pos = collend;
6674 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006675
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006676 case _Py_ERROR_XMLCHARREFREPLACE:
6677 str = xmlcharrefreplace(&writer, 1, str,
6678 unicode, collstart, collend);
6679 if (str == NULL)
6680 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006681 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 break;
Victor Stinner50149202015-09-22 00:26:54 +02006683
Victor Stinnerc3713e92015-09-29 12:32:13 +02006684 case _Py_ERROR_SURROGATEESCAPE:
6685 for (i = collstart; i < collend; ++i) {
6686 ch = PyUnicode_READ(kind, data, i);
6687 if (ch < 0xdc80 || 0xdcff < ch) {
6688 /* Not a UTF-8b surrogate */
6689 break;
6690 }
6691 *str++ = (char)(ch - 0xdc00);
6692 ++pos;
6693 }
6694 if (i >= collend)
6695 break;
6696 collstart = pos;
6697 assert(collstart != collend);
6698 /* fallback to general error handling */
6699
Benjamin Peterson29060642009-01-31 22:14:21 +00006700 default:
Victor Stinner50149202015-09-22 00:26:54 +02006701 repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 encoding, reason, unicode, &exc,
6703 collstart, collend, &newpos);
6704 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006705 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006707
Martin v. Löwis011e8422009-05-05 04:43:17 +00006708 if (PyBytes_Check(repunicode)) {
6709 /* Directly copy bytes result to output. */
6710 repsize = PyBytes_Size(repunicode);
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006711 if (repsize > 1) {
6712 str = _PyBytesWriter_Prepare(&writer, str, repsize-1);
6713 if (str == NULL)
6714 goto onError;
6715 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006716 memcpy(str, PyBytes_AsString(repunicode), repsize);
6717 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006719 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006720 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006721 }
Victor Stinner0030cd52015-09-24 14:45:00 +02006722
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 /* need more space? (at least enough for what we
6724 have+the replacement+the rest of the string, so
6725 we won't have to check space for encodable characters) */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006726 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006727 if (repsize > 1) {
6728 str = _PyBytesWriter_Prepare(&writer, str, repsize-1);
6729 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 }
Victor Stinner0030cd52015-09-24 14:45:00 +02006732
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 /* check if there is anything unencodable in the replacement
6734 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 for (i = 0; repsize-->0; ++i, ++str) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006736 ch = PyUnicode_READ_CHAR(repunicode, i);
6737 if (ch >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006738 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006739 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 Py_DECREF(repunicode);
6741 goto onError;
6742 }
Victor Stinner0030cd52015-09-24 14:45:00 +02006743 *str = (char)ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006745 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006746 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006747 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006748
6749 /* If overallocation was disabled, ensure that it was the last
6750 write. Otherwise, we missed an optimization */
6751 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006752 }
6753 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006754
Victor Stinner50149202015-09-22 00:26:54 +02006755 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006756 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006757 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006758
6759 onError:
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006760 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006761 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006762 Py_XDECREF(exc);
6763 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006764}
6765
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006766/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006767PyObject *
6768PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006769 Py_ssize_t size,
6770 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006772 PyObject *result;
6773 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6774 if (unicode == NULL)
6775 return NULL;
6776 result = unicode_encode_ucs1(unicode, errors, 256);
6777 Py_DECREF(unicode);
6778 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779}
6780
Alexander Belopolsky40018472011-02-26 01:02:56 +00006781PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006782_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783{
6784 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 PyErr_BadArgument();
6786 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006788 if (PyUnicode_READY(unicode) == -1)
6789 return NULL;
6790 /* Fast path: if it is a one-byte string, construct
6791 bytes object directly. */
6792 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6793 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6794 PyUnicode_GET_LENGTH(unicode));
6795 /* Non-Latin-1 characters present. Defer to above function to
6796 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006797 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006798}
6799
6800PyObject*
6801PyUnicode_AsLatin1String(PyObject *unicode)
6802{
6803 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804}
6805
6806/* --- 7-bit ASCII Codec -------------------------------------------------- */
6807
Alexander Belopolsky40018472011-02-26 01:02:56 +00006808PyObject *
6809PyUnicode_DecodeASCII(const char *s,
6810 Py_ssize_t size,
6811 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006813 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006814 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006815 int kind;
6816 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006817 Py_ssize_t startinpos;
6818 Py_ssize_t endinpos;
6819 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006820 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006821 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006823 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006824
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006826 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006827
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006829 if (size == 1 && (unsigned char)s[0] < 128)
6830 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006831
Victor Stinner8f674cc2013-04-17 23:02:17 +02006832 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006833 writer.min_length = size;
6834 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006835 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006836
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006837 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006838 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006839 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006840 writer.pos = outpos;
6841 if (writer.pos == size)
6842 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006843
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006844 s += writer.pos;
6845 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006846 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006847 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006849 PyUnicode_WRITE(kind, data, writer.pos, c);
6850 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006852 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006854
6855 /* byte outsize range 0x00..0x7f: call the error handler */
6856
6857 if (error_handler == _Py_ERROR_UNKNOWN)
6858 error_handler = get_error_handler(errors);
6859
6860 switch (error_handler)
6861 {
6862 case _Py_ERROR_REPLACE:
6863 case _Py_ERROR_SURROGATEESCAPE:
6864 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006865 but we may switch to UCS2 at the first write */
6866 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6867 goto onError;
6868 kind = writer.kind;
6869 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006870
6871 if (error_handler == _Py_ERROR_REPLACE)
6872 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6873 else
6874 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6875 writer.pos++;
6876 ++s;
6877 break;
6878
6879 case _Py_ERROR_IGNORE:
6880 ++s;
6881 break;
6882
6883 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 startinpos = s-starts;
6885 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006886 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006887 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 "ascii", "ordinal not in range(128)",
6889 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006890 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006892 kind = writer.kind;
6893 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006896 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006897 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006898 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006899
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006901 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006902 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006903 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 return NULL;
6905}
6906
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006907/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006908PyObject *
6909PyUnicode_EncodeASCII(const Py_UNICODE *p,
6910 Py_ssize_t size,
6911 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006913 PyObject *result;
6914 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6915 if (unicode == NULL)
6916 return NULL;
6917 result = unicode_encode_ucs1(unicode, errors, 128);
6918 Py_DECREF(unicode);
6919 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920}
6921
Alexander Belopolsky40018472011-02-26 01:02:56 +00006922PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006923_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924{
6925 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 PyErr_BadArgument();
6927 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006929 if (PyUnicode_READY(unicode) == -1)
6930 return NULL;
6931 /* Fast path: if it is an ASCII-only string, construct bytes object
6932 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006933 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006934 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6935 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006936 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006937}
6938
6939PyObject *
6940PyUnicode_AsASCIIString(PyObject *unicode)
6941{
6942 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943}
6944
Victor Stinner99b95382011-07-04 14:23:54 +02006945#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006946
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006947/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006948
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006949#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006950#define NEED_RETRY
6951#endif
6952
Victor Stinner3a50e702011-10-18 21:21:00 +02006953#ifndef WC_ERR_INVALID_CHARS
6954# define WC_ERR_INVALID_CHARS 0x0080
6955#endif
6956
6957static char*
6958code_page_name(UINT code_page, PyObject **obj)
6959{
6960 *obj = NULL;
6961 if (code_page == CP_ACP)
6962 return "mbcs";
6963 if (code_page == CP_UTF7)
6964 return "CP_UTF7";
6965 if (code_page == CP_UTF8)
6966 return "CP_UTF8";
6967
6968 *obj = PyBytes_FromFormat("cp%u", code_page);
6969 if (*obj == NULL)
6970 return NULL;
6971 return PyBytes_AS_STRING(*obj);
6972}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006973
Victor Stinner3a50e702011-10-18 21:21:00 +02006974static DWORD
6975decode_code_page_flags(UINT code_page)
6976{
6977 if (code_page == CP_UTF7) {
6978 /* The CP_UTF7 decoder only supports flags=0 */
6979 return 0;
6980 }
6981 else
6982 return MB_ERR_INVALID_CHARS;
6983}
6984
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006985/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006986 * Decode a byte string from a Windows code page into unicode object in strict
6987 * mode.
6988 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006989 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6990 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006991 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006992static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006993decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006994 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006995 const char *in,
6996 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006997{
Victor Stinner3a50e702011-10-18 21:21:00 +02006998 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006999 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007000 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007001
7002 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007003 assert(insize > 0);
7004 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7005 if (outsize <= 0)
7006 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007007
7008 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007010 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007011 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 if (*v == NULL)
7013 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007014 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007015 }
7016 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007018 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007019 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007021 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007022 }
7023
7024 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007025 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7026 if (outsize <= 0)
7027 goto error;
7028 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007029
Victor Stinner3a50e702011-10-18 21:21:00 +02007030error:
7031 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7032 return -2;
7033 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007034 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007035}
7036
Victor Stinner3a50e702011-10-18 21:21:00 +02007037/*
7038 * Decode a byte string from a code page into unicode object with an error
7039 * handler.
7040 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007041 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007042 * UnicodeDecodeError exception and returns -1 on error.
7043 */
7044static int
7045decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007046 PyObject **v,
7047 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007048 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007049{
7050 const char *startin = in;
7051 const char *endin = in + size;
7052 const DWORD flags = decode_code_page_flags(code_page);
7053 /* Ideally, we should get reason from FormatMessage. This is the Windows
7054 2000 English version of the message. */
7055 const char *reason = "No mapping for the Unicode character exists "
7056 "in the target code page.";
7057 /* each step cannot decode more than 1 character, but a character can be
7058 represented as a surrogate pair */
7059 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007060 int insize;
7061 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007062 PyObject *errorHandler = NULL;
7063 PyObject *exc = NULL;
7064 PyObject *encoding_obj = NULL;
7065 char *encoding;
7066 DWORD err;
7067 int ret = -1;
7068
7069 assert(size > 0);
7070
7071 encoding = code_page_name(code_page, &encoding_obj);
7072 if (encoding == NULL)
7073 return -1;
7074
Victor Stinner7d00cc12014-03-17 23:08:06 +01007075 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007076 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7077 UnicodeDecodeError. */
7078 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7079 if (exc != NULL) {
7080 PyCodec_StrictErrors(exc);
7081 Py_CLEAR(exc);
7082 }
7083 goto error;
7084 }
7085
7086 if (*v == NULL) {
7087 /* Create unicode object */
7088 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7089 PyErr_NoMemory();
7090 goto error;
7091 }
Victor Stinnerab595942011-12-17 04:59:06 +01007092 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007093 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007094 if (*v == NULL)
7095 goto error;
7096 startout = PyUnicode_AS_UNICODE(*v);
7097 }
7098 else {
7099 /* Extend unicode object */
7100 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7101 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7102 PyErr_NoMemory();
7103 goto error;
7104 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007105 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 goto error;
7107 startout = PyUnicode_AS_UNICODE(*v) + n;
7108 }
7109
7110 /* Decode the byte string character per character */
7111 out = startout;
7112 while (in < endin)
7113 {
7114 /* Decode a character */
7115 insize = 1;
7116 do
7117 {
7118 outsize = MultiByteToWideChar(code_page, flags,
7119 in, insize,
7120 buffer, Py_ARRAY_LENGTH(buffer));
7121 if (outsize > 0)
7122 break;
7123 err = GetLastError();
7124 if (err != ERROR_NO_UNICODE_TRANSLATION
7125 && err != ERROR_INSUFFICIENT_BUFFER)
7126 {
7127 PyErr_SetFromWindowsErr(0);
7128 goto error;
7129 }
7130 insize++;
7131 }
7132 /* 4=maximum length of a UTF-8 sequence */
7133 while (insize <= 4 && (in + insize) <= endin);
7134
7135 if (outsize <= 0) {
7136 Py_ssize_t startinpos, endinpos, outpos;
7137
Victor Stinner7d00cc12014-03-17 23:08:06 +01007138 /* last character in partial decode? */
7139 if (in + insize >= endin && !final)
7140 break;
7141
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 startinpos = in - startin;
7143 endinpos = startinpos + 1;
7144 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007145 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 errors, &errorHandler,
7147 encoding, reason,
7148 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007149 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 {
7151 goto error;
7152 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007153 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 }
7155 else {
7156 in += insize;
7157 memcpy(out, buffer, outsize * sizeof(wchar_t));
7158 out += outsize;
7159 }
7160 }
7161
7162 /* write a NUL character at the end */
7163 *out = 0;
7164
7165 /* Extend unicode object */
7166 outsize = out - startout;
7167 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007168 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007169 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007170 /* (in - startin) <= size and size is an int */
7171 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007172
7173error:
7174 Py_XDECREF(encoding_obj);
7175 Py_XDECREF(errorHandler);
7176 Py_XDECREF(exc);
7177 return ret;
7178}
7179
Victor Stinner3a50e702011-10-18 21:21:00 +02007180static PyObject *
7181decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007182 const char *s, Py_ssize_t size,
7183 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007184{
Victor Stinner76a31a62011-11-04 00:05:13 +01007185 PyObject *v = NULL;
7186 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007187
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 if (code_page < 0) {
7189 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7190 return NULL;
7191 }
7192
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007193 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007194 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007195
Victor Stinner76a31a62011-11-04 00:05:13 +01007196 do
7197 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007198#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007199 if (size > INT_MAX) {
7200 chunk_size = INT_MAX;
7201 final = 0;
7202 done = 0;
7203 }
7204 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007205#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007206 {
7207 chunk_size = (int)size;
7208 final = (consumed == NULL);
7209 done = 1;
7210 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007211
Victor Stinner76a31a62011-11-04 00:05:13 +01007212 if (chunk_size == 0 && done) {
7213 if (v != NULL)
7214 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007215 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007216 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217
Victor Stinner76a31a62011-11-04 00:05:13 +01007218 converted = decode_code_page_strict(code_page, &v,
7219 s, chunk_size);
7220 if (converted == -2)
7221 converted = decode_code_page_errors(code_page, &v,
7222 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007223 errors, final);
7224 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007225
7226 if (converted < 0) {
7227 Py_XDECREF(v);
7228 return NULL;
7229 }
7230
7231 if (consumed)
7232 *consumed += converted;
7233
7234 s += converted;
7235 size -= converted;
7236 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007237
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007238 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007239}
7240
Alexander Belopolsky40018472011-02-26 01:02:56 +00007241PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007242PyUnicode_DecodeCodePageStateful(int code_page,
7243 const char *s,
7244 Py_ssize_t size,
7245 const char *errors,
7246 Py_ssize_t *consumed)
7247{
7248 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7249}
7250
7251PyObject *
7252PyUnicode_DecodeMBCSStateful(const char *s,
7253 Py_ssize_t size,
7254 const char *errors,
7255 Py_ssize_t *consumed)
7256{
7257 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7258}
7259
7260PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007261PyUnicode_DecodeMBCS(const char *s,
7262 Py_ssize_t size,
7263 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007264{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007265 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7266}
7267
Victor Stinner3a50e702011-10-18 21:21:00 +02007268static DWORD
7269encode_code_page_flags(UINT code_page, const char *errors)
7270{
7271 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007272 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 }
7274 else if (code_page == CP_UTF7) {
7275 /* CP_UTF7 only supports flags=0 */
7276 return 0;
7277 }
7278 else {
7279 if (errors != NULL && strcmp(errors, "replace") == 0)
7280 return 0;
7281 else
7282 return WC_NO_BEST_FIT_CHARS;
7283 }
7284}
7285
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007286/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007287 * Encode a Unicode string to a Windows code page into a byte string in strict
7288 * mode.
7289 *
7290 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007291 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007292 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007293static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007294encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007295 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007297{
Victor Stinner554f3f02010-06-16 23:33:54 +00007298 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 BOOL *pusedDefaultChar = &usedDefaultChar;
7300 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007301 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007302 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007303 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 const DWORD flags = encode_code_page_flags(code_page, NULL);
7305 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007306 /* Create a substring so that we can get the UTF-16 representation
7307 of just the slice under consideration. */
7308 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007309
Martin v. Löwis3d325192011-11-04 18:23:06 +01007310 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007311
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007313 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007315 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007316
Victor Stinner2fc507f2011-11-04 20:06:39 +01007317 substring = PyUnicode_Substring(unicode, offset, offset+len);
7318 if (substring == NULL)
7319 return -1;
7320 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7321 if (p == NULL) {
7322 Py_DECREF(substring);
7323 return -1;
7324 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007325 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007326
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007327 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007329 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 NULL, 0,
7331 NULL, pusedDefaultChar);
7332 if (outsize <= 0)
7333 goto error;
7334 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007335 if (pusedDefaultChar && *pusedDefaultChar) {
7336 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007337 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007338 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007339
Victor Stinner3a50e702011-10-18 21:21:00 +02007340 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007342 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007343 if (*outbytes == NULL) {
7344 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007346 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007347 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007348 }
7349 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007351 const Py_ssize_t n = PyBytes_Size(*outbytes);
7352 if (outsize > PY_SSIZE_T_MAX - n) {
7353 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007354 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007356 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007357 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7358 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007359 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007360 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007361 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007362 }
7363
7364 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007365 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007366 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007367 out, outsize,
7368 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007369 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007370 if (outsize <= 0)
7371 goto error;
7372 if (pusedDefaultChar && *pusedDefaultChar)
7373 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007374 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007375
Victor Stinner3a50e702011-10-18 21:21:00 +02007376error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007377 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007378 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7379 return -2;
7380 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007381 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007382}
7383
Victor Stinner3a50e702011-10-18 21:21:00 +02007384/*
7385 * Encode a Unicode string to a Windows code page into a byte string using a
7386 * error handler.
7387 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007388 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 * -1 on other error.
7390 */
7391static int
7392encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007393 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007394 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007395{
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007397 Py_ssize_t pos = unicode_offset;
7398 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 /* Ideally, we should get reason from FormatMessage. This is the Windows
7400 2000 English version of the message. */
7401 const char *reason = "invalid character";
7402 /* 4=maximum length of a UTF-8 sequence */
7403 char buffer[4];
7404 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7405 Py_ssize_t outsize;
7406 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007407 PyObject *errorHandler = NULL;
7408 PyObject *exc = NULL;
7409 PyObject *encoding_obj = NULL;
7410 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007411 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007412 PyObject *rep;
7413 int ret = -1;
7414
7415 assert(insize > 0);
7416
7417 encoding = code_page_name(code_page, &encoding_obj);
7418 if (encoding == NULL)
7419 return -1;
7420
7421 if (errors == NULL || strcmp(errors, "strict") == 0) {
7422 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7423 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007424 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 if (exc != NULL) {
7426 PyCodec_StrictErrors(exc);
7427 Py_DECREF(exc);
7428 }
7429 Py_XDECREF(encoding_obj);
7430 return -1;
7431 }
7432
7433 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7434 pusedDefaultChar = &usedDefaultChar;
7435 else
7436 pusedDefaultChar = NULL;
7437
7438 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7439 PyErr_NoMemory();
7440 goto error;
7441 }
7442 outsize = insize * Py_ARRAY_LENGTH(buffer);
7443
7444 if (*outbytes == NULL) {
7445 /* Create string object */
7446 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7447 if (*outbytes == NULL)
7448 goto error;
7449 out = PyBytes_AS_STRING(*outbytes);
7450 }
7451 else {
7452 /* Extend string object */
7453 Py_ssize_t n = PyBytes_Size(*outbytes);
7454 if (n > PY_SSIZE_T_MAX - outsize) {
7455 PyErr_NoMemory();
7456 goto error;
7457 }
7458 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7459 goto error;
7460 out = PyBytes_AS_STRING(*outbytes) + n;
7461 }
7462
7463 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007464 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007466 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7467 wchar_t chars[2];
7468 int charsize;
7469 if (ch < 0x10000) {
7470 chars[0] = (wchar_t)ch;
7471 charsize = 1;
7472 }
7473 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007474 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7475 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007476 charsize = 2;
7477 }
7478
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007480 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 buffer, Py_ARRAY_LENGTH(buffer),
7482 NULL, pusedDefaultChar);
7483 if (outsize > 0) {
7484 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7485 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007486 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 memcpy(out, buffer, outsize);
7488 out += outsize;
7489 continue;
7490 }
7491 }
7492 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7493 PyErr_SetFromWindowsErr(0);
7494 goto error;
7495 }
7496
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 rep = unicode_encode_call_errorhandler(
7498 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007499 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007500 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 if (rep == NULL)
7502 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007503 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007504
7505 if (PyBytes_Check(rep)) {
7506 outsize = PyBytes_GET_SIZE(rep);
7507 if (outsize != 1) {
7508 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7509 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7510 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7511 Py_DECREF(rep);
7512 goto error;
7513 }
7514 out = PyBytes_AS_STRING(*outbytes) + offset;
7515 }
7516 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7517 out += outsize;
7518 }
7519 else {
7520 Py_ssize_t i;
7521 enum PyUnicode_Kind kind;
7522 void *data;
7523
Benjamin Petersonbac79492012-01-14 13:34:47 -05007524 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 Py_DECREF(rep);
7526 goto error;
7527 }
7528
7529 outsize = PyUnicode_GET_LENGTH(rep);
7530 if (outsize != 1) {
7531 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7532 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7533 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7534 Py_DECREF(rep);
7535 goto error;
7536 }
7537 out = PyBytes_AS_STRING(*outbytes) + offset;
7538 }
7539 kind = PyUnicode_KIND(rep);
7540 data = PyUnicode_DATA(rep);
7541 for (i=0; i < outsize; i++) {
7542 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7543 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007544 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007545 encoding, unicode,
7546 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 "unable to encode error handler result to ASCII");
7548 Py_DECREF(rep);
7549 goto error;
7550 }
7551 *out = (unsigned char)ch;
7552 out++;
7553 }
7554 }
7555 Py_DECREF(rep);
7556 }
7557 /* write a NUL byte */
7558 *out = 0;
7559 outsize = out - PyBytes_AS_STRING(*outbytes);
7560 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7561 if (_PyBytes_Resize(outbytes, outsize) < 0)
7562 goto error;
7563 ret = 0;
7564
7565error:
7566 Py_XDECREF(encoding_obj);
7567 Py_XDECREF(errorHandler);
7568 Py_XDECREF(exc);
7569 return ret;
7570}
7571
Victor Stinner3a50e702011-10-18 21:21:00 +02007572static PyObject *
7573encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007574 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007575 const char *errors)
7576{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007577 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007579 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007580 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007581
Victor Stinner29dacf22015-01-26 16:41:32 +01007582 if (!PyUnicode_Check(unicode)) {
7583 PyErr_BadArgument();
7584 return NULL;
7585 }
7586
Benjamin Petersonbac79492012-01-14 13:34:47 -05007587 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007588 return NULL;
7589 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007590
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 if (code_page < 0) {
7592 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7593 return NULL;
7594 }
7595
Martin v. Löwis3d325192011-11-04 18:23:06 +01007596 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007597 return PyBytes_FromStringAndSize(NULL, 0);
7598
Victor Stinner7581cef2011-11-03 22:32:33 +01007599 offset = 0;
7600 do
7601 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007602#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007603 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007604 chunks. */
7605 if (len > INT_MAX/2) {
7606 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007607 done = 0;
7608 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007609 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007610#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007611 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007612 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007613 done = 1;
7614 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007615
Victor Stinner76a31a62011-11-04 00:05:13 +01007616 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007617 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007618 errors);
7619 if (ret == -2)
7620 ret = encode_code_page_errors(code_page, &outbytes,
7621 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007622 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007623 if (ret < 0) {
7624 Py_XDECREF(outbytes);
7625 return NULL;
7626 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007627
Victor Stinner7581cef2011-11-03 22:32:33 +01007628 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007629 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007630 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007631
Victor Stinner3a50e702011-10-18 21:21:00 +02007632 return outbytes;
7633}
7634
7635PyObject *
7636PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7637 Py_ssize_t size,
7638 const char *errors)
7639{
Victor Stinner7581cef2011-11-03 22:32:33 +01007640 PyObject *unicode, *res;
7641 unicode = PyUnicode_FromUnicode(p, size);
7642 if (unicode == NULL)
7643 return NULL;
7644 res = encode_code_page(CP_ACP, unicode, errors);
7645 Py_DECREF(unicode);
7646 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007647}
7648
7649PyObject *
7650PyUnicode_EncodeCodePage(int code_page,
7651 PyObject *unicode,
7652 const char *errors)
7653{
Victor Stinner7581cef2011-11-03 22:32:33 +01007654 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007655}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007656
Alexander Belopolsky40018472011-02-26 01:02:56 +00007657PyObject *
7658PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007659{
Victor Stinner7581cef2011-11-03 22:32:33 +01007660 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007661}
7662
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007663#undef NEED_RETRY
7664
Victor Stinner99b95382011-07-04 14:23:54 +02007665#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007666
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667/* --- Character Mapping Codec -------------------------------------------- */
7668
Victor Stinnerfb161b12013-04-18 01:44:27 +02007669static int
7670charmap_decode_string(const char *s,
7671 Py_ssize_t size,
7672 PyObject *mapping,
7673 const char *errors,
7674 _PyUnicodeWriter *writer)
7675{
7676 const char *starts = s;
7677 const char *e;
7678 Py_ssize_t startinpos, endinpos;
7679 PyObject *errorHandler = NULL, *exc = NULL;
7680 Py_ssize_t maplen;
7681 enum PyUnicode_Kind mapkind;
7682 void *mapdata;
7683 Py_UCS4 x;
7684 unsigned char ch;
7685
7686 if (PyUnicode_READY(mapping) == -1)
7687 return -1;
7688
7689 maplen = PyUnicode_GET_LENGTH(mapping);
7690 mapdata = PyUnicode_DATA(mapping);
7691 mapkind = PyUnicode_KIND(mapping);
7692
7693 e = s + size;
7694
7695 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7696 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7697 * is disabled in encoding aliases, latin1 is preferred because
7698 * its implementation is faster. */
7699 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7700 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7701 Py_UCS4 maxchar = writer->maxchar;
7702
7703 assert (writer->kind == PyUnicode_1BYTE_KIND);
7704 while (s < e) {
7705 ch = *s;
7706 x = mapdata_ucs1[ch];
7707 if (x > maxchar) {
7708 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7709 goto onError;
7710 maxchar = writer->maxchar;
7711 outdata = (Py_UCS1 *)writer->data;
7712 }
7713 outdata[writer->pos] = x;
7714 writer->pos++;
7715 ++s;
7716 }
7717 return 0;
7718 }
7719
7720 while (s < e) {
7721 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7722 enum PyUnicode_Kind outkind = writer->kind;
7723 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7724 if (outkind == PyUnicode_1BYTE_KIND) {
7725 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7726 Py_UCS4 maxchar = writer->maxchar;
7727 while (s < e) {
7728 ch = *s;
7729 x = mapdata_ucs2[ch];
7730 if (x > maxchar)
7731 goto Error;
7732 outdata[writer->pos] = x;
7733 writer->pos++;
7734 ++s;
7735 }
7736 break;
7737 }
7738 else if (outkind == PyUnicode_2BYTE_KIND) {
7739 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7740 while (s < e) {
7741 ch = *s;
7742 x = mapdata_ucs2[ch];
7743 if (x == 0xFFFE)
7744 goto Error;
7745 outdata[writer->pos] = x;
7746 writer->pos++;
7747 ++s;
7748 }
7749 break;
7750 }
7751 }
7752 ch = *s;
7753
7754 if (ch < maplen)
7755 x = PyUnicode_READ(mapkind, mapdata, ch);
7756 else
7757 x = 0xfffe; /* invalid value */
7758Error:
7759 if (x == 0xfffe)
7760 {
7761 /* undefined mapping */
7762 startinpos = s-starts;
7763 endinpos = startinpos+1;
7764 if (unicode_decode_call_errorhandler_writer(
7765 errors, &errorHandler,
7766 "charmap", "character maps to <undefined>",
7767 &starts, &e, &startinpos, &endinpos, &exc, &s,
7768 writer)) {
7769 goto onError;
7770 }
7771 continue;
7772 }
7773
7774 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7775 goto onError;
7776 ++s;
7777 }
7778 Py_XDECREF(errorHandler);
7779 Py_XDECREF(exc);
7780 return 0;
7781
7782onError:
7783 Py_XDECREF(errorHandler);
7784 Py_XDECREF(exc);
7785 return -1;
7786}
7787
7788static int
7789charmap_decode_mapping(const char *s,
7790 Py_ssize_t size,
7791 PyObject *mapping,
7792 const char *errors,
7793 _PyUnicodeWriter *writer)
7794{
7795 const char *starts = s;
7796 const char *e;
7797 Py_ssize_t startinpos, endinpos;
7798 PyObject *errorHandler = NULL, *exc = NULL;
7799 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007800 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007801
7802 e = s + size;
7803
7804 while (s < e) {
7805 ch = *s;
7806
7807 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7808 key = PyLong_FromLong((long)ch);
7809 if (key == NULL)
7810 goto onError;
7811
7812 item = PyObject_GetItem(mapping, key);
7813 Py_DECREF(key);
7814 if (item == NULL) {
7815 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7816 /* No mapping found means: mapping is undefined. */
7817 PyErr_Clear();
7818 goto Undefined;
7819 } else
7820 goto onError;
7821 }
7822
7823 /* Apply mapping */
7824 if (item == Py_None)
7825 goto Undefined;
7826 if (PyLong_Check(item)) {
7827 long value = PyLong_AS_LONG(item);
7828 if (value == 0xFFFE)
7829 goto Undefined;
7830 if (value < 0 || value > MAX_UNICODE) {
7831 PyErr_Format(PyExc_TypeError,
7832 "character mapping must be in range(0x%lx)",
7833 (unsigned long)MAX_UNICODE + 1);
7834 goto onError;
7835 }
7836
7837 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7838 goto onError;
7839 }
7840 else if (PyUnicode_Check(item)) {
7841 if (PyUnicode_READY(item) == -1)
7842 goto onError;
7843 if (PyUnicode_GET_LENGTH(item) == 1) {
7844 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7845 if (value == 0xFFFE)
7846 goto Undefined;
7847 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7848 goto onError;
7849 }
7850 else {
7851 writer->overallocate = 1;
7852 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7853 goto onError;
7854 }
7855 }
7856 else {
7857 /* wrong return value */
7858 PyErr_SetString(PyExc_TypeError,
7859 "character mapping must return integer, None or str");
7860 goto onError;
7861 }
7862 Py_CLEAR(item);
7863 ++s;
7864 continue;
7865
7866Undefined:
7867 /* undefined mapping */
7868 Py_CLEAR(item);
7869 startinpos = s-starts;
7870 endinpos = startinpos+1;
7871 if (unicode_decode_call_errorhandler_writer(
7872 errors, &errorHandler,
7873 "charmap", "character maps to <undefined>",
7874 &starts, &e, &startinpos, &endinpos, &exc, &s,
7875 writer)) {
7876 goto onError;
7877 }
7878 }
7879 Py_XDECREF(errorHandler);
7880 Py_XDECREF(exc);
7881 return 0;
7882
7883onError:
7884 Py_XDECREF(item);
7885 Py_XDECREF(errorHandler);
7886 Py_XDECREF(exc);
7887 return -1;
7888}
7889
Alexander Belopolsky40018472011-02-26 01:02:56 +00007890PyObject *
7891PyUnicode_DecodeCharmap(const char *s,
7892 Py_ssize_t size,
7893 PyObject *mapping,
7894 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007896 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007897
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898 /* Default to Latin-1 */
7899 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007903 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007904 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007905 writer.min_length = size;
7906 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007908
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007909 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007910 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7911 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007912 }
7913 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007914 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7915 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007917 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007918
Benjamin Peterson29060642009-01-31 22:14:21 +00007919 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007920 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921 return NULL;
7922}
7923
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007924/* Charmap encoding: the lookup table */
7925
Alexander Belopolsky40018472011-02-26 01:02:56 +00007926struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 PyObject_HEAD
7928 unsigned char level1[32];
7929 int count2, count3;
7930 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007931};
7932
7933static PyObject*
7934encoding_map_size(PyObject *obj, PyObject* args)
7935{
7936 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007937 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007939}
7940
7941static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007942 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 PyDoc_STR("Return the size (in bytes) of this object") },
7944 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007945};
7946
7947static void
7948encoding_map_dealloc(PyObject* o)
7949{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007950 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007951}
7952
7953static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007954 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 "EncodingMap", /*tp_name*/
7956 sizeof(struct encoding_map), /*tp_basicsize*/
7957 0, /*tp_itemsize*/
7958 /* methods */
7959 encoding_map_dealloc, /*tp_dealloc*/
7960 0, /*tp_print*/
7961 0, /*tp_getattr*/
7962 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007963 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 0, /*tp_repr*/
7965 0, /*tp_as_number*/
7966 0, /*tp_as_sequence*/
7967 0, /*tp_as_mapping*/
7968 0, /*tp_hash*/
7969 0, /*tp_call*/
7970 0, /*tp_str*/
7971 0, /*tp_getattro*/
7972 0, /*tp_setattro*/
7973 0, /*tp_as_buffer*/
7974 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7975 0, /*tp_doc*/
7976 0, /*tp_traverse*/
7977 0, /*tp_clear*/
7978 0, /*tp_richcompare*/
7979 0, /*tp_weaklistoffset*/
7980 0, /*tp_iter*/
7981 0, /*tp_iternext*/
7982 encoding_map_methods, /*tp_methods*/
7983 0, /*tp_members*/
7984 0, /*tp_getset*/
7985 0, /*tp_base*/
7986 0, /*tp_dict*/
7987 0, /*tp_descr_get*/
7988 0, /*tp_descr_set*/
7989 0, /*tp_dictoffset*/
7990 0, /*tp_init*/
7991 0, /*tp_alloc*/
7992 0, /*tp_new*/
7993 0, /*tp_free*/
7994 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007995};
7996
7997PyObject*
7998PyUnicode_BuildEncodingMap(PyObject* string)
7999{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008000 PyObject *result;
8001 struct encoding_map *mresult;
8002 int i;
8003 int need_dict = 0;
8004 unsigned char level1[32];
8005 unsigned char level2[512];
8006 unsigned char *mlevel1, *mlevel2, *mlevel3;
8007 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008008 int kind;
8009 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008010 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008011 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008012
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008013 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008014 PyErr_BadArgument();
8015 return NULL;
8016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008017 kind = PyUnicode_KIND(string);
8018 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008019 length = PyUnicode_GET_LENGTH(string);
8020 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008021 memset(level1, 0xFF, sizeof level1);
8022 memset(level2, 0xFF, sizeof level2);
8023
8024 /* If there isn't a one-to-one mapping of NULL to \0,
8025 or if there are non-BMP characters, we need to use
8026 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008027 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008028 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008029 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008030 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008031 ch = PyUnicode_READ(kind, data, i);
8032 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008033 need_dict = 1;
8034 break;
8035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008036 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037 /* unmapped character */
8038 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008039 l1 = ch >> 11;
8040 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041 if (level1[l1] == 0xFF)
8042 level1[l1] = count2++;
8043 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008044 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045 }
8046
8047 if (count2 >= 0xFF || count3 >= 0xFF)
8048 need_dict = 1;
8049
8050 if (need_dict) {
8051 PyObject *result = PyDict_New();
8052 PyObject *key, *value;
8053 if (!result)
8054 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008055 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008056 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008057 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008058 if (!key || !value)
8059 goto failed1;
8060 if (PyDict_SetItem(result, key, value) == -1)
8061 goto failed1;
8062 Py_DECREF(key);
8063 Py_DECREF(value);
8064 }
8065 return result;
8066 failed1:
8067 Py_XDECREF(key);
8068 Py_XDECREF(value);
8069 Py_DECREF(result);
8070 return NULL;
8071 }
8072
8073 /* Create a three-level trie */
8074 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8075 16*count2 + 128*count3 - 1);
8076 if (!result)
8077 return PyErr_NoMemory();
8078 PyObject_Init(result, &EncodingMapType);
8079 mresult = (struct encoding_map*)result;
8080 mresult->count2 = count2;
8081 mresult->count3 = count3;
8082 mlevel1 = mresult->level1;
8083 mlevel2 = mresult->level23;
8084 mlevel3 = mresult->level23 + 16*count2;
8085 memcpy(mlevel1, level1, 32);
8086 memset(mlevel2, 0xFF, 16*count2);
8087 memset(mlevel3, 0, 128*count3);
8088 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008089 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008091 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8092 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008093 /* unmapped character */
8094 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008095 o1 = ch>>11;
8096 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008097 i2 = 16*mlevel1[o1] + o2;
8098 if (mlevel2[i2] == 0xFF)
8099 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008100 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008101 i3 = 128*mlevel2[i2] + o3;
8102 mlevel3[i3] = i;
8103 }
8104 return result;
8105}
8106
8107static int
Victor Stinner22168992011-11-20 17:09:18 +01008108encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008109{
8110 struct encoding_map *map = (struct encoding_map*)mapping;
8111 int l1 = c>>11;
8112 int l2 = (c>>7) & 0xF;
8113 int l3 = c & 0x7F;
8114 int i;
8115
Victor Stinner22168992011-11-20 17:09:18 +01008116 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008118 if (c == 0)
8119 return 0;
8120 /* level 1*/
8121 i = map->level1[l1];
8122 if (i == 0xFF) {
8123 return -1;
8124 }
8125 /* level 2*/
8126 i = map->level23[16*i+l2];
8127 if (i == 0xFF) {
8128 return -1;
8129 }
8130 /* level 3 */
8131 i = map->level23[16*map->count2 + 128*i + l3];
8132 if (i == 0) {
8133 return -1;
8134 }
8135 return i;
8136}
8137
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008138/* Lookup the character ch in the mapping. If the character
8139 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008140 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008141static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008142charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143{
Christian Heimes217cfd12007-12-02 14:31:20 +00008144 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008145 PyObject *x;
8146
8147 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008149 x = PyObject_GetItem(mapping, w);
8150 Py_DECREF(w);
8151 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8153 /* No mapping found means: mapping is undefined. */
8154 PyErr_Clear();
8155 x = Py_None;
8156 Py_INCREF(x);
8157 return x;
8158 } else
8159 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008161 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008163 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008164 long value = PyLong_AS_LONG(x);
8165 if (value < 0 || value > 255) {
8166 PyErr_SetString(PyExc_TypeError,
8167 "character mapping must be in range(256)");
8168 Py_DECREF(x);
8169 return NULL;
8170 }
8171 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008173 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008175 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 /* wrong return value */
8177 PyErr_Format(PyExc_TypeError,
8178 "character mapping must return integer, bytes or None, not %.400s",
8179 x->ob_type->tp_name);
8180 Py_DECREF(x);
8181 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182 }
8183}
8184
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008185static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008186charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008187{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008188 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8189 /* exponentially overallocate to minimize reallocations */
8190 if (requiredsize < 2*outsize)
8191 requiredsize = 2*outsize;
8192 if (_PyBytes_Resize(outobj, requiredsize))
8193 return -1;
8194 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008195}
8196
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008200/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008201 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008202 space is available. Return a new reference to the object that
8203 was put in the output buffer, or Py_None, if the mapping was undefined
8204 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008205 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008206static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008207charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008208 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008209{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008210 PyObject *rep;
8211 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008212 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213
Christian Heimes90aa7642007-12-19 02:45:37 +00008214 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008215 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217 if (res == -1)
8218 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 if (outsize<requiredsize)
8220 if (charmapencode_resize(outobj, outpos, requiredsize))
8221 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008222 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008223 outstart[(*outpos)++] = (char)res;
8224 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008225 }
8226
8227 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008228 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008229 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008230 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 Py_DECREF(rep);
8232 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008233 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 if (PyLong_Check(rep)) {
8235 Py_ssize_t requiredsize = *outpos+1;
8236 if (outsize<requiredsize)
8237 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8238 Py_DECREF(rep);
8239 return enc_EXCEPTION;
8240 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008241 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008243 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 else {
8245 const char *repchars = PyBytes_AS_STRING(rep);
8246 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8247 Py_ssize_t requiredsize = *outpos+repsize;
8248 if (outsize<requiredsize)
8249 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8250 Py_DECREF(rep);
8251 return enc_EXCEPTION;
8252 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008253 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 memcpy(outstart + *outpos, repchars, repsize);
8255 *outpos += repsize;
8256 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008258 Py_DECREF(rep);
8259 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260}
8261
8262/* handle an error in PyUnicode_EncodeCharmap
8263 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008264static int
8265charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008266 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008267 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008268 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008269 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270{
8271 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008272 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008273 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008274 enum PyUnicode_Kind kind;
8275 void *data;
8276 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008277 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008278 Py_ssize_t collstartpos = *inpos;
8279 Py_ssize_t collendpos = *inpos+1;
8280 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008281 char *encoding = "charmap";
8282 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008283 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008284 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008285 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286
Benjamin Petersonbac79492012-01-14 13:34:47 -05008287 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008288 return -1;
8289 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008290 /* find all unencodable characters */
8291 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008292 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008293 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008294 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008295 val = encoding_map_lookup(ch, mapping);
8296 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 break;
8298 ++collendpos;
8299 continue;
8300 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008301
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008302 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8303 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 if (rep==NULL)
8305 return -1;
8306 else if (rep!=Py_None) {
8307 Py_DECREF(rep);
8308 break;
8309 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008310 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008312 }
8313 /* cache callback name lookup
8314 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008315 if (*error_handler == _Py_ERROR_UNKNOWN)
8316 *error_handler = get_error_handler(errors);
8317
8318 switch (*error_handler) {
8319 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008320 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008321 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008322
8323 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008324 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 x = charmapencode_output('?', mapping, res, respos);
8326 if (x==enc_EXCEPTION) {
8327 return -1;
8328 }
8329 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008330 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 return -1;
8332 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008333 }
8334 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008335 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008336 *inpos = collendpos;
8337 break;
Victor Stinner50149202015-09-22 00:26:54 +02008338
8339 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008340 /* generate replacement (temporarily (mis)uses p) */
8341 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 char buffer[2+29+1+1];
8343 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008344 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 for (cp = buffer; *cp; ++cp) {
8346 x = charmapencode_output(*cp, mapping, res, respos);
8347 if (x==enc_EXCEPTION)
8348 return -1;
8349 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008350 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 return -1;
8352 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008353 }
8354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008355 *inpos = collendpos;
8356 break;
Victor Stinner50149202015-09-22 00:26:54 +02008357
Benjamin Peterson14339b62009-01-31 16:36:08 +00008358 default:
Victor Stinner50149202015-09-22 00:26:54 +02008359 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008360 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008362 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008364 if (PyBytes_Check(repunicode)) {
8365 /* Directly copy bytes result to output. */
8366 Py_ssize_t outsize = PyBytes_Size(*res);
8367 Py_ssize_t requiredsize;
8368 repsize = PyBytes_Size(repunicode);
8369 requiredsize = *respos + repsize;
8370 if (requiredsize > outsize)
8371 /* Make room for all additional bytes. */
8372 if (charmapencode_resize(res, respos, requiredsize)) {
8373 Py_DECREF(repunicode);
8374 return -1;
8375 }
8376 memcpy(PyBytes_AsString(*res) + *respos,
8377 PyBytes_AsString(repunicode), repsize);
8378 *respos += repsize;
8379 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008380 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008381 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008382 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008383 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008384 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008385 Py_DECREF(repunicode);
8386 return -1;
8387 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008388 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008389 data = PyUnicode_DATA(repunicode);
8390 kind = PyUnicode_KIND(repunicode);
8391 for (index = 0; index < repsize; index++) {
8392 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8393 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008395 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 return -1;
8397 }
8398 else if (x==enc_FAILED) {
8399 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008400 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 return -1;
8402 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008403 }
8404 *inpos = newpos;
8405 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 }
8407 return 0;
8408}
8409
Alexander Belopolsky40018472011-02-26 01:02:56 +00008410PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008411_PyUnicode_EncodeCharmap(PyObject *unicode,
8412 PyObject *mapping,
8413 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 /* output object */
8416 PyObject *res = NULL;
8417 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008418 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008419 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008421 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008422 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008424 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008425 void *data;
8426 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427
Benjamin Petersonbac79492012-01-14 13:34:47 -05008428 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008429 return NULL;
8430 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008431 data = PyUnicode_DATA(unicode);
8432 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008433
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 /* Default to Latin-1 */
8435 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008436 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438 /* allocate enough for a simple encoding without
8439 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008440 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008441 if (res == NULL)
8442 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008443 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008446 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008447 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008449 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 if (x==enc_EXCEPTION) /* error */
8451 goto onError;
8452 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008453 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008455 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 &res, &respos)) {
8457 goto onError;
8458 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008459 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 else
8461 /* done with this character => adjust input position */
8462 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008465 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008466 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008467 if (_PyBytes_Resize(&res, respos) < 0)
8468 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008469
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008470 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008471 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472 return res;
8473
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475 Py_XDECREF(res);
8476 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008477 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478 return NULL;
8479}
8480
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008481/* Deprecated */
8482PyObject *
8483PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8484 Py_ssize_t size,
8485 PyObject *mapping,
8486 const char *errors)
8487{
8488 PyObject *result;
8489 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8490 if (unicode == NULL)
8491 return NULL;
8492 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8493 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008494 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008495}
8496
Alexander Belopolsky40018472011-02-26 01:02:56 +00008497PyObject *
8498PyUnicode_AsCharmapString(PyObject *unicode,
8499 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500{
8501 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 PyErr_BadArgument();
8503 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008505 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506}
8507
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008509static void
8510make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008512 Py_ssize_t startpos, Py_ssize_t endpos,
8513 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 *exceptionObject = _PyUnicodeTranslateError_Create(
8517 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008518 }
8519 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8521 goto onError;
8522 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8523 goto onError;
8524 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8525 goto onError;
8526 return;
8527 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008528 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529 }
8530}
8531
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532/* error handling callback helper:
8533 build arguments, call the callback and check the arguments,
8534 put the result into newpos and return the replacement string, which
8535 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008536static PyObject *
8537unicode_translate_call_errorhandler(const char *errors,
8538 PyObject **errorHandler,
8539 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008541 Py_ssize_t startpos, Py_ssize_t endpos,
8542 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008544 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008546 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008547 PyObject *restuple;
8548 PyObject *resunicode;
8549
8550 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008552 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554 }
8555
8556 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560
8561 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008566 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 Py_DECREF(restuple);
8568 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008569 }
8570 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 &resunicode, &i_newpos)) {
8572 Py_DECREF(restuple);
8573 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008575 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008577 else
8578 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008580 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 Py_DECREF(restuple);
8582 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008583 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008584 Py_INCREF(resunicode);
8585 Py_DECREF(restuple);
8586 return resunicode;
8587}
8588
8589/* Lookup the character ch in the mapping and put the result in result,
8590 which must be decrefed by the caller.
8591 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008592static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594{
Christian Heimes217cfd12007-12-02 14:31:20 +00008595 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008596 PyObject *x;
8597
8598 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600 x = PyObject_GetItem(mapping, w);
8601 Py_DECREF(w);
8602 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8604 /* No mapping found means: use 1:1 mapping. */
8605 PyErr_Clear();
8606 *result = NULL;
8607 return 0;
8608 } else
8609 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008610 }
8611 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 *result = x;
8613 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008614 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008615 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008617 if (value < 0 || value > MAX_UNICODE) {
8618 PyErr_Format(PyExc_ValueError,
8619 "character mapping must be in range(0x%x)",
8620 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 Py_DECREF(x);
8622 return -1;
8623 }
8624 *result = x;
8625 return 0;
8626 }
8627 else if (PyUnicode_Check(x)) {
8628 *result = x;
8629 return 0;
8630 }
8631 else {
8632 /* wrong return value */
8633 PyErr_SetString(PyExc_TypeError,
8634 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008635 Py_DECREF(x);
8636 return -1;
8637 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638}
Victor Stinner1194ea02014-04-04 19:37:40 +02008639
8640/* lookup the character, write the result into the writer.
8641 Return 1 if the result was written into the writer, return 0 if the mapping
8642 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008643static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008644charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8645 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646{
Victor Stinner1194ea02014-04-04 19:37:40 +02008647 PyObject *item;
8648
8649 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008651
8652 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008654 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008657 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008658 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008659
8660 if (item == Py_None) {
8661 Py_DECREF(item);
8662 return 0;
8663 }
8664
8665 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008666 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8667 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8668 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008669 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8670 Py_DECREF(item);
8671 return -1;
8672 }
8673 Py_DECREF(item);
8674 return 1;
8675 }
8676
8677 if (!PyUnicode_Check(item)) {
8678 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008680 }
8681
8682 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8683 Py_DECREF(item);
8684 return -1;
8685 }
8686
8687 Py_DECREF(item);
8688 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689}
8690
Victor Stinner89a76ab2014-04-05 11:44:04 +02008691static int
8692unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8693 Py_UCS1 *translate)
8694{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008695 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008696 int ret = 0;
8697
Victor Stinner89a76ab2014-04-05 11:44:04 +02008698 if (charmaptranslate_lookup(ch, mapping, &item)) {
8699 return -1;
8700 }
8701
8702 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008703 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008704 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008705 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008706 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008707 /* not found => default to 1:1 mapping */
8708 translate[ch] = ch;
8709 return 1;
8710 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008711 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008712 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008713 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8714 used it */
8715 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008716 /* invalid character or character outside ASCII:
8717 skip the fast translate */
8718 goto exit;
8719 }
8720 translate[ch] = (Py_UCS1)replace;
8721 }
8722 else if (PyUnicode_Check(item)) {
8723 Py_UCS4 replace;
8724
8725 if (PyUnicode_READY(item) == -1) {
8726 Py_DECREF(item);
8727 return -1;
8728 }
8729 if (PyUnicode_GET_LENGTH(item) != 1)
8730 goto exit;
8731
8732 replace = PyUnicode_READ_CHAR(item, 0);
8733 if (replace > 127)
8734 goto exit;
8735 translate[ch] = (Py_UCS1)replace;
8736 }
8737 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008738 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008739 goto exit;
8740 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008741 ret = 1;
8742
Benjamin Peterson1365de72014-04-07 20:15:41 -04008743 exit:
8744 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008745 return ret;
8746}
8747
8748/* Fast path for ascii => ascii translation. Return 1 if the whole string
8749 was translated into writer, return 0 if the input string was partially
8750 translated into writer, raise an exception and return -1 on error. */
8751static int
8752unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner872b2912014-04-05 14:27:07 +02008753 _PyUnicodeWriter *writer, int ignore)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008754{
Victor Stinner872b2912014-04-05 14:27:07 +02008755 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008756 Py_ssize_t len;
8757 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008758 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008759
8760 if (PyUnicode_READY(input) == -1)
8761 return -1;
8762 if (!PyUnicode_IS_ASCII(input))
8763 return 0;
8764 len = PyUnicode_GET_LENGTH(input);
8765
Victor Stinner872b2912014-04-05 14:27:07 +02008766 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008767
8768 in = PyUnicode_1BYTE_DATA(input);
8769 end = in + len;
8770
8771 assert(PyUnicode_IS_ASCII(writer->buffer));
8772 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8773 out = PyUnicode_1BYTE_DATA(writer->buffer);
8774
Victor Stinner872b2912014-04-05 14:27:07 +02008775 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008776 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008777 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008778 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008779 int translate = unicode_fast_translate_lookup(mapping, ch,
8780 ascii_table);
8781 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008782 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008783 if (translate == 0)
8784 goto exit;
8785 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008786 }
Victor Stinner872b2912014-04-05 14:27:07 +02008787 if (ch2 == 0xfe) {
8788 if (ignore)
8789 continue;
8790 goto exit;
8791 }
8792 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008793 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008794 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008795 }
Victor Stinner872b2912014-04-05 14:27:07 +02008796 res = 1;
8797
8798exit:
8799 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8800 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008801}
8802
Victor Stinner3222da22015-10-01 22:07:32 +02008803static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804_PyUnicode_TranslateCharmap(PyObject *input,
8805 PyObject *mapping,
8806 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008808 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008809 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008810 Py_ssize_t size, i;
8811 int kind;
8812 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008813 _PyUnicodeWriter writer;
8814 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008815 char *reason = "character maps to <undefined>";
8816 PyObject *errorHandler = NULL;
8817 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008818 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008819 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008820
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008822 PyErr_BadArgument();
8823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008826 if (PyUnicode_READY(input) == -1)
8827 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008828 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008829 kind = PyUnicode_KIND(input);
8830 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831
8832 if (size == 0) {
8833 Py_INCREF(input);
8834 return input;
8835 }
8836
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008837 /* allocate enough for a simple 1:1 translation without
8838 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008839 _PyUnicodeWriter_Init(&writer);
8840 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842
Victor Stinner872b2912014-04-05 14:27:07 +02008843 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8844
8845 res = unicode_fast_translate(input, mapping, &writer, ignore);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008846 if (res < 0) {
8847 _PyUnicodeWriter_Dealloc(&writer);
8848 return NULL;
8849 }
8850 if (res == 1)
8851 return _PyUnicodeWriter_Finish(&writer);
8852
Victor Stinner89a76ab2014-04-05 11:44:04 +02008853 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008856 int translate;
8857 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8858 Py_ssize_t newpos;
8859 /* startpos for collecting untranslatable chars */
8860 Py_ssize_t collstart;
8861 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008862 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863
Victor Stinner1194ea02014-04-04 19:37:40 +02008864 ch = PyUnicode_READ(kind, data, i);
8865 translate = charmaptranslate_output(ch, mapping, &writer);
8866 if (translate < 0)
8867 goto onError;
8868
8869 if (translate != 0) {
8870 /* it worked => adjust input pointer */
8871 ++i;
8872 continue;
8873 }
8874
8875 /* untranslatable character */
8876 collstart = i;
8877 collend = i+1;
8878
8879 /* find all untranslatable characters */
8880 while (collend < size) {
8881 PyObject *x;
8882 ch = PyUnicode_READ(kind, data, collend);
8883 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008884 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008885 Py_XDECREF(x);
8886 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008888 ++collend;
8889 }
8890
8891 if (ignore) {
8892 i = collend;
8893 }
8894 else {
8895 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8896 reason, input, &exc,
8897 collstart, collend, &newpos);
8898 if (repunicode == NULL)
8899 goto onError;
8900 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008902 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008903 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008904 Py_DECREF(repunicode);
8905 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008906 }
8907 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008908 Py_XDECREF(exc);
8909 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008910 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008913 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008914 Py_XDECREF(exc);
8915 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916 return NULL;
8917}
8918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919/* Deprecated. Use PyUnicode_Translate instead. */
8920PyObject *
8921PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8922 Py_ssize_t size,
8923 PyObject *mapping,
8924 const char *errors)
8925{
Christian Heimes5f520f42012-09-11 14:03:25 +02008926 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8928 if (!unicode)
8929 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008930 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8931 Py_DECREF(unicode);
8932 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933}
8934
Alexander Belopolsky40018472011-02-26 01:02:56 +00008935PyObject *
8936PyUnicode_Translate(PyObject *str,
8937 PyObject *mapping,
8938 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939{
8940 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008941
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 str = PyUnicode_FromObject(str);
8943 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008944 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 Py_DECREF(str);
8947 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948}
Tim Petersced69f82003-09-16 20:30:58 +00008949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008951fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952{
8953 /* No need to call PyUnicode_READY(self) because this function is only
8954 called as a callback from fixup() which does it already. */
8955 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8956 const int kind = PyUnicode_KIND(self);
8957 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008958 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008959 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 Py_ssize_t i;
8961
8962 for (i = 0; i < len; ++i) {
8963 ch = PyUnicode_READ(kind, data, i);
8964 fixed = 0;
8965 if (ch > 127) {
8966 if (Py_UNICODE_ISSPACE(ch))
8967 fixed = ' ';
8968 else {
8969 const int decimal = Py_UNICODE_TODECIMAL(ch);
8970 if (decimal >= 0)
8971 fixed = '0' + decimal;
8972 }
8973 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008974 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008975 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 PyUnicode_WRITE(kind, data, i, fixed);
8977 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008978 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008979 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 }
8982
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008983 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984}
8985
8986PyObject *
8987_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8988{
8989 if (!PyUnicode_Check(unicode)) {
8990 PyErr_BadInternalCall();
8991 return NULL;
8992 }
8993 if (PyUnicode_READY(unicode) == -1)
8994 return NULL;
8995 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8996 /* If the string is already ASCII, just return the same string */
8997 Py_INCREF(unicode);
8998 return unicode;
8999 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009000 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001}
9002
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009003PyObject *
9004PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9005 Py_ssize_t length)
9006{
Victor Stinnerf0124502011-11-21 23:12:56 +01009007 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009008 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009009 Py_UCS4 maxchar;
9010 enum PyUnicode_Kind kind;
9011 void *data;
9012
Victor Stinner99d7ad02012-02-22 13:37:39 +01009013 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009014 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009015 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009016 if (ch > 127) {
9017 int decimal = Py_UNICODE_TODECIMAL(ch);
9018 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009019 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009020 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009021 }
9022 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009023
9024 /* Copy to a new string */
9025 decimal = PyUnicode_New(length, maxchar);
9026 if (decimal == NULL)
9027 return decimal;
9028 kind = PyUnicode_KIND(decimal);
9029 data = PyUnicode_DATA(decimal);
9030 /* Iterate over code points */
9031 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009032 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009033 if (ch > 127) {
9034 int decimal = Py_UNICODE_TODECIMAL(ch);
9035 if (decimal >= 0)
9036 ch = '0' + decimal;
9037 }
9038 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009040 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009041}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009042/* --- Decimal Encoder ---------------------------------------------------- */
9043
Alexander Belopolsky40018472011-02-26 01:02:56 +00009044int
9045PyUnicode_EncodeDecimal(Py_UNICODE *s,
9046 Py_ssize_t length,
9047 char *output,
9048 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009049{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009050 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009051 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009052 enum PyUnicode_Kind kind;
9053 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009054
9055 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009056 PyErr_BadArgument();
9057 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009058 }
9059
Victor Stinner42bf7752011-11-21 22:52:58 +01009060 unicode = PyUnicode_FromUnicode(s, length);
9061 if (unicode == NULL)
9062 return -1;
9063
Benjamin Petersonbac79492012-01-14 13:34:47 -05009064 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009065 Py_DECREF(unicode);
9066 return -1;
9067 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009068 kind = PyUnicode_KIND(unicode);
9069 data = PyUnicode_DATA(unicode);
9070
Victor Stinnerb84d7232011-11-22 01:50:07 +01009071 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009072 PyObject *exc;
9073 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009074 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009075 Py_ssize_t startpos;
9076
9077 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009078
Benjamin Peterson29060642009-01-31 22:14:21 +00009079 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009080 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009081 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009083 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009084 decimal = Py_UNICODE_TODECIMAL(ch);
9085 if (decimal >= 0) {
9086 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009087 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 continue;
9089 }
9090 if (0 < ch && ch < 256) {
9091 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009092 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009093 continue;
9094 }
Victor Stinner6345be92011-11-25 20:09:01 +01009095
Victor Stinner42bf7752011-11-21 22:52:58 +01009096 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009097 exc = NULL;
9098 raise_encode_exception(&exc, "decimal", unicode,
9099 startpos, startpos+1,
9100 "invalid decimal Unicode string");
9101 Py_XDECREF(exc);
9102 Py_DECREF(unicode);
9103 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009104 }
9105 /* 0-terminate the output string */
9106 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009107 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009108 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009109}
9110
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111/* --- Helpers ------------------------------------------------------------ */
9112
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the length of the
   sequence being sliced.  An `end` beyond `len` is clipped to `len`;
   negative values are taken relative to `len` and then clipped to 0.

   The body is wrapped in do { ... } while (0) so that the macro behaves as
   a single statement (safe as the body of an unbraced if/else), and every
   argument is parenthesized so that expressions can be passed for `len`.
   Note that the arguments are still evaluated more than once. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009129any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 Py_ssize_t start,
9131 Py_ssize_t end)
9132{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009133 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009134 void *buf1, *buf2;
9135 Py_ssize_t len1, len2, result;
9136
9137 kind1 = PyUnicode_KIND(s1);
9138 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009139 if (kind1 < kind2)
9140 return -1;
9141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 len1 = PyUnicode_GET_LENGTH(s1);
9143 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009144 ADJUST_INDICES(start, end, len1);
9145 if (end - start < len2)
9146 return -1;
9147
9148 buf1 = PyUnicode_DATA(s1);
9149 buf2 = PyUnicode_DATA(s2);
9150 if (len2 == 1) {
9151 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9152 result = findchar((const char *)buf1 + kind1*start,
9153 kind1, end - start, ch, direction);
9154 if (result == -1)
9155 return -1;
9156 else
9157 return start + result;
9158 }
9159
9160 if (kind2 != kind1) {
9161 buf2 = _PyUnicode_AsKind(s2, kind1);
9162 if (!buf2)
9163 return -2;
9164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009165
Victor Stinner794d5672011-10-10 03:21:36 +02009166 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009167 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009168 case PyUnicode_1BYTE_KIND:
9169 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9170 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9171 else
9172 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9173 break;
9174 case PyUnicode_2BYTE_KIND:
9175 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9176 break;
9177 case PyUnicode_4BYTE_KIND:
9178 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9179 break;
9180 default:
9181 assert(0); result = -2;
9182 }
9183 }
9184 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009185 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009186 case PyUnicode_1BYTE_KIND:
9187 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9188 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9189 else
9190 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9191 break;
9192 case PyUnicode_2BYTE_KIND:
9193 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9194 break;
9195 case PyUnicode_4BYTE_KIND:
9196 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9197 break;
9198 default:
9199 assert(0); result = -2;
9200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009201 }
9202
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009203 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009204 PyMem_Free(buf2);
9205
9206 return result;
9207}
9208
9209Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009210_PyUnicode_InsertThousandsGrouping(
9211 PyObject *unicode, Py_ssize_t index,
9212 Py_ssize_t n_buffer,
9213 void *digits, Py_ssize_t n_digits,
9214 Py_ssize_t min_width,
9215 const char *grouping, PyObject *thousands_sep,
9216 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217{
Victor Stinner41a863c2012-02-24 00:37:51 +01009218 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009219 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009220 Py_ssize_t thousands_sep_len;
9221 Py_ssize_t len;
9222
9223 if (unicode != NULL) {
9224 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009225 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009226 }
9227 else {
9228 kind = PyUnicode_1BYTE_KIND;
9229 data = NULL;
9230 }
9231 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9232 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9233 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9234 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009235 if (thousands_sep_kind < kind) {
9236 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9237 if (!thousands_sep_data)
9238 return -1;
9239 }
9240 else {
9241 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9242 if (!data)
9243 return -1;
9244 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009245 }
9246
Benjamin Petersonead6b532011-12-20 17:23:42 -06009247 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009249 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009250 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009251 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009252 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009253 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009254 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009255 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009256 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009257 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009258 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009259 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009260 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009261 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009262 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009263 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009264 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009265 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009267 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009268 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009269 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009270 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009271 break;
9272 default:
9273 assert(0);
9274 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009276 if (unicode != NULL && thousands_sep_kind != kind) {
9277 if (thousands_sep_kind < kind)
9278 PyMem_Free(thousands_sep_data);
9279 else
9280 PyMem_Free(data);
9281 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009282 if (unicode == NULL) {
9283 *maxchar = 127;
9284 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009285 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009286 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009287 }
9288 }
9289 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290}
9291
9292
Alexander Belopolsky40018472011-02-26 01:02:56 +00009293Py_ssize_t
9294PyUnicode_Count(PyObject *str,
9295 PyObject *substr,
9296 Py_ssize_t start,
9297 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009299 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009300 PyObject* str_obj;
9301 PyObject* sub_obj;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009302 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009303 void *buf1 = NULL, *buf2 = NULL;
9304 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009305
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009306 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009307 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009308 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009309 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009310 if (!sub_obj) {
9311 Py_DECREF(str_obj);
9312 return -1;
9313 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009314 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009315 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009316 Py_DECREF(str_obj);
9317 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318 }
Tim Petersced69f82003-09-16 20:30:58 +00009319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 kind1 = PyUnicode_KIND(str_obj);
9321 kind2 = PyUnicode_KIND(sub_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009322 if (kind1 < kind2) {
9323 Py_DECREF(sub_obj);
9324 Py_DECREF(str_obj);
9325 return 0;
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009326 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 len1 = PyUnicode_GET_LENGTH(str_obj);
9329 len2 = PyUnicode_GET_LENGTH(sub_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009331 if (end - start < len2) {
9332 Py_DECREF(sub_obj);
9333 Py_DECREF(str_obj);
9334 return 0;
9335 }
9336
9337 buf1 = PyUnicode_DATA(str_obj);
9338 buf2 = PyUnicode_DATA(sub_obj);
9339 if (kind2 != kind1) {
9340 buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9341 if (!buf2)
9342 goto onError;
9343 }
9344
9345 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009347 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9348 result = asciilib_count(
9349 ((Py_UCS1*)buf1) + start, end - start,
9350 buf2, len2, PY_SSIZE_T_MAX
9351 );
9352 else
9353 result = ucs1lib_count(
9354 ((Py_UCS1*)buf1) + start, end - start,
9355 buf2, len2, PY_SSIZE_T_MAX
9356 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 break;
9358 case PyUnicode_2BYTE_KIND:
9359 result = ucs2lib_count(
9360 ((Py_UCS2*)buf1) + start, end - start,
9361 buf2, len2, PY_SSIZE_T_MAX
9362 );
9363 break;
9364 case PyUnicode_4BYTE_KIND:
9365 result = ucs4lib_count(
9366 ((Py_UCS4*)buf1) + start, end - start,
9367 buf2, len2, PY_SSIZE_T_MAX
9368 );
9369 break;
9370 default:
9371 assert(0); result = 0;
9372 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009373
9374 Py_DECREF(sub_obj);
9375 Py_DECREF(str_obj);
9376
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009377 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 PyMem_Free(buf2);
9379
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381 onError:
9382 Py_DECREF(sub_obj);
9383 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009384 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 PyMem_Free(buf2);
9386 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387}
9388
Alexander Belopolsky40018472011-02-26 01:02:56 +00009389Py_ssize_t
9390PyUnicode_Find(PyObject *str,
9391 PyObject *sub,
9392 Py_ssize_t start,
9393 Py_ssize_t end,
9394 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009396 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009397
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009399 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009400 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009401 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009402 if (!sub) {
9403 Py_DECREF(str);
9404 return -2;
9405 }
9406 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9407 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009408 Py_DECREF(str);
9409 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410 }
Tim Petersced69f82003-09-16 20:30:58 +00009411
Victor Stinner794d5672011-10-10 03:21:36 +02009412 result = any_find_slice(direction,
9413 str, sub, start, end
9414 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009415
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009417 Py_DECREF(sub);
9418
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419 return result;
9420}
9421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422Py_ssize_t
9423PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9424 Py_ssize_t start, Py_ssize_t end,
9425 int direction)
9426{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009428 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 if (PyUnicode_READY(str) == -1)
9430 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009431 if (start < 0 || end < 0) {
9432 PyErr_SetString(PyExc_IndexError, "string index out of range");
9433 return -2;
9434 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 if (end > PyUnicode_GET_LENGTH(str))
9436 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009437 if (start >= end)
9438 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009440 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9441 kind, end-start, ch, direction);
9442 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009444 else
9445 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446}
9447
Alexander Belopolsky40018472011-02-26 01:02:56 +00009448static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009449tailmatch(PyObject *self,
9450 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009451 Py_ssize_t start,
9452 Py_ssize_t end,
9453 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 int kind_self;
9456 int kind_sub;
9457 void *data_self;
9458 void *data_sub;
9459 Py_ssize_t offset;
9460 Py_ssize_t i;
9461 Py_ssize_t end_sub;
9462
9463 if (PyUnicode_READY(self) == -1 ||
9464 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009465 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9468 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009470 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009472 if (PyUnicode_GET_LENGTH(substring) == 0)
9473 return 1;
9474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 kind_self = PyUnicode_KIND(self);
9476 data_self = PyUnicode_DATA(self);
9477 kind_sub = PyUnicode_KIND(substring);
9478 data_sub = PyUnicode_DATA(substring);
9479 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9480
9481 if (direction > 0)
9482 offset = end;
9483 else
9484 offset = start;
9485
9486 if (PyUnicode_READ(kind_self, data_self, offset) ==
9487 PyUnicode_READ(kind_sub, data_sub, 0) &&
9488 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9489 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9490 /* If both are of the same kind, memcmp is sufficient */
9491 if (kind_self == kind_sub) {
9492 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009493 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 data_sub,
9495 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009496 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 }
9498 /* otherwise we have to compare each character by first accesing it */
9499 else {
9500 /* We do not need to compare 0 and len(substring)-1 because
9501 the if statement above ensured already that they are equal
9502 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 for (i = 1; i < end_sub; ++i) {
9504 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9505 PyUnicode_READ(kind_sub, data_sub, i))
9506 return 0;
9507 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510 }
9511
9512 return 0;
9513}
9514
Alexander Belopolsky40018472011-02-26 01:02:56 +00009515Py_ssize_t
9516PyUnicode_Tailmatch(PyObject *str,
9517 PyObject *substr,
9518 Py_ssize_t start,
9519 Py_ssize_t end,
9520 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009521{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009522 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009523
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524 str = PyUnicode_FromObject(str);
9525 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009526 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527 substr = PyUnicode_FromObject(substr);
9528 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009529 Py_DECREF(str);
9530 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531 }
Tim Petersced69f82003-09-16 20:30:58 +00009532
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009533 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535 Py_DECREF(str);
9536 Py_DECREF(substr);
9537 return result;
9538}
9539
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540/* Apply fixfct filter to the Unicode object self and return a
9541 reference to the modified object */
9542
Alexander Belopolsky40018472011-02-26 01:02:56 +00009543static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009544fixup(PyObject *self,
9545 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547 PyObject *u;
9548 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009549 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009551 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009553 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009554 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556 /* fix functions return the new maximum character in a string,
9557 if the kind of the resulting unicode object does not change,
9558 everything is fine. Otherwise we need to change the string kind
9559 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009560 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009561
9562 if (maxchar_new == 0) {
9563 /* no changes */;
9564 if (PyUnicode_CheckExact(self)) {
9565 Py_DECREF(u);
9566 Py_INCREF(self);
9567 return self;
9568 }
9569 else
9570 return u;
9571 }
9572
Victor Stinnere6abb482012-05-02 01:15:40 +02009573 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009574
Victor Stinnereaab6042011-12-11 22:22:39 +01009575 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009577
9578 /* In case the maximum character changed, we need to
9579 convert the string to the new category. */
9580 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9581 if (v == NULL) {
9582 Py_DECREF(u);
9583 return NULL;
9584 }
9585 if (maxchar_new > maxchar_old) {
9586 /* If the maxchar increased so that the kind changed, not all
9587 characters are representable anymore and we need to fix the
9588 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009589 _PyUnicode_FastCopyCharacters(v, 0,
9590 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009591 maxchar_old = fixfct(v);
9592 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 }
9594 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009595 _PyUnicode_FastCopyCharacters(v, 0,
9596 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009597 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009598 Py_DECREF(u);
9599 assert(_PyUnicode_CheckConsistency(v, 1));
9600 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009601}
9602
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009603static PyObject *
9604ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009606 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9607 char *resdata, *data = PyUnicode_DATA(self);
9608 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009609
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009610 res = PyUnicode_New(len, 127);
9611 if (res == NULL)
9612 return NULL;
9613 resdata = PyUnicode_DATA(res);
9614 if (lower)
9615 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009617 _Py_bytes_upper(resdata, data, len);
9618 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619}
9620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009622handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009624 Py_ssize_t j;
9625 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009626 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009627 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009628
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009629 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9630
9631 where ! is a negation and \p{xxx} is a character with property xxx.
9632 */
9633 for (j = i - 1; j >= 0; j--) {
9634 c = PyUnicode_READ(kind, data, j);
9635 if (!_PyUnicode_IsCaseIgnorable(c))
9636 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009638 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9639 if (final_sigma) {
9640 for (j = i + 1; j < length; j++) {
9641 c = PyUnicode_READ(kind, data, j);
9642 if (!_PyUnicode_IsCaseIgnorable(c))
9643 break;
9644 }
9645 final_sigma = j == length || !_PyUnicode_IsCased(c);
9646 }
9647 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648}
9649
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009650static int
9651lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9652 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009654 /* Obscure special case. */
9655 if (c == 0x3A3) {
9656 mapped[0] = handle_capital_sigma(kind, data, length, i);
9657 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009659 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660}
9661
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662static Py_ssize_t
9663do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009665 Py_ssize_t i, k = 0;
9666 int n_res, j;
9667 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009668
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009669 c = PyUnicode_READ(kind, data, 0);
9670 n_res = _PyUnicode_ToUpperFull(c, mapped);
9671 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009672 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675 for (i = 1; i < length; i++) {
9676 c = PyUnicode_READ(kind, data, i);
9677 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9678 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009679 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009681 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009682 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684}
9685
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009686static Py_ssize_t
9687do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9688 Py_ssize_t i, k = 0;
9689
9690 for (i = 0; i < length; i++) {
9691 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9692 int n_res, j;
9693 if (Py_UNICODE_ISUPPER(c)) {
9694 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9695 }
9696 else if (Py_UNICODE_ISLOWER(c)) {
9697 n_res = _PyUnicode_ToUpperFull(c, mapped);
9698 }
9699 else {
9700 n_res = 1;
9701 mapped[0] = c;
9702 }
9703 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009704 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009705 res[k++] = mapped[j];
9706 }
9707 }
9708 return k;
9709}
9710
9711static Py_ssize_t
9712do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9713 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009715 Py_ssize_t i, k = 0;
9716
9717 for (i = 0; i < length; i++) {
9718 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9719 int n_res, j;
9720 if (lower)
9721 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9722 else
9723 n_res = _PyUnicode_ToUpperFull(c, mapped);
9724 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009725 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009726 res[k++] = mapped[j];
9727 }
9728 }
9729 return k;
9730}
9731
9732static Py_ssize_t
9733do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9734{
9735 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9736}
9737
9738static Py_ssize_t
9739do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9740{
9741 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9742}
9743
Benjamin Petersone51757f2012-01-12 21:10:29 -05009744static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009745do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9746{
9747 Py_ssize_t i, k = 0;
9748
9749 for (i = 0; i < length; i++) {
9750 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9751 Py_UCS4 mapped[3];
9752 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9753 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009754 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009755 res[k++] = mapped[j];
9756 }
9757 }
9758 return k;
9759}
9760
9761static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009762do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9763{
9764 Py_ssize_t i, k = 0;
9765 int previous_is_cased;
9766
9767 previous_is_cased = 0;
9768 for (i = 0; i < length; i++) {
9769 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9770 Py_UCS4 mapped[3];
9771 int n_res, j;
9772
9773 if (previous_is_cased)
9774 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9775 else
9776 n_res = _PyUnicode_ToTitleFull(c, mapped);
9777
9778 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009779 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009780 res[k++] = mapped[j];
9781 }
9782
9783 previous_is_cased = _PyUnicode_IsCased(c);
9784 }
9785 return k;
9786}
9787
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009788static PyObject *
9789case_operation(PyObject *self,
9790 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9791{
9792 PyObject *res = NULL;
9793 Py_ssize_t length, newlength = 0;
9794 int kind, outkind;
9795 void *data, *outdata;
9796 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9797
Benjamin Petersoneea48462012-01-16 14:28:50 -05009798 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009799
9800 kind = PyUnicode_KIND(self);
9801 data = PyUnicode_DATA(self);
9802 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009803 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009804 PyErr_SetString(PyExc_OverflowError, "string is too long");
9805 return NULL;
9806 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009807 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009808 if (tmp == NULL)
9809 return PyErr_NoMemory();
9810 newlength = perform(kind, data, length, tmp, &maxchar);
9811 res = PyUnicode_New(newlength, maxchar);
9812 if (res == NULL)
9813 goto leave;
9814 tmpend = tmp + newlength;
9815 outdata = PyUnicode_DATA(res);
9816 outkind = PyUnicode_KIND(res);
9817 switch (outkind) {
9818 case PyUnicode_1BYTE_KIND:
9819 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9820 break;
9821 case PyUnicode_2BYTE_KIND:
9822 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9823 break;
9824 case PyUnicode_4BYTE_KIND:
9825 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9826 break;
9827 default:
9828 assert(0);
9829 break;
9830 }
9831 leave:
9832 PyMem_FREE(tmp);
9833 return res;
9834}
9835
Tim Peters8ce9f162004-08-27 01:49:32 +00009836PyObject *
9837PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009838{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009840 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009842 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009843 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9844 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009845 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009847 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009848 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009849 int use_memcpy;
9850 unsigned char *res_data = NULL, *sep_data = NULL;
9851 PyObject *last_obj;
9852 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009854 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009855 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009856 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009857 }
9858
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009859 /* NOTE: the following code can't call back into Python code,
9860 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009861 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009862
Tim Peters05eba1f2004-08-27 21:32:02 +00009863 seqlen = PySequence_Fast_GET_SIZE(fseq);
9864 /* If empty sequence, return u"". */
9865 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009866 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009867 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009868 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009869
Tim Peters05eba1f2004-08-27 21:32:02 +00009870 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009871 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009872 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009873 if (seqlen == 1) {
9874 if (PyUnicode_CheckExact(items[0])) {
9875 res = items[0];
9876 Py_INCREF(res);
9877 Py_DECREF(fseq);
9878 return res;
9879 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009880 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009881 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009882 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009883 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009884 /* Set up sep and seplen */
9885 if (separator == NULL) {
9886 /* fall back to a blank space separator */
9887 sep = PyUnicode_FromOrdinal(' ');
9888 if (!sep)
9889 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009890 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009891 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009892 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009893 else {
9894 if (!PyUnicode_Check(separator)) {
9895 PyErr_Format(PyExc_TypeError,
9896 "separator: expected str instance,"
9897 " %.80s found",
9898 Py_TYPE(separator)->tp_name);
9899 goto onError;
9900 }
9901 if (PyUnicode_READY(separator))
9902 goto onError;
9903 sep = separator;
9904 seplen = PyUnicode_GET_LENGTH(separator);
9905 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9906 /* inc refcount to keep this code path symmetric with the
9907 above case of a blank separator */
9908 Py_INCREF(sep);
9909 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009910 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009911 }
9912
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009913 /* There are at least two things to join, or else we have a subclass
9914 * of str in the sequence.
9915 * Do a pre-pass to figure out the total amount of space we'll
9916 * need (sz), and see whether all argument are strings.
9917 */
9918 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009919#ifdef Py_DEBUG
9920 use_memcpy = 0;
9921#else
9922 use_memcpy = 1;
9923#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009924 for (i = 0; i < seqlen; i++) {
9925 const Py_ssize_t old_sz = sz;
9926 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009927 if (!PyUnicode_Check(item)) {
9928 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009929 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009930 " %.80s found",
9931 i, Py_TYPE(item)->tp_name);
9932 goto onError;
9933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 if (PyUnicode_READY(item) == -1)
9935 goto onError;
9936 sz += PyUnicode_GET_LENGTH(item);
9937 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009938 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009939 if (i != 0)
9940 sz += seplen;
9941 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9942 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009943 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009944 goto onError;
9945 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009946 if (use_memcpy && last_obj != NULL) {
9947 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9948 use_memcpy = 0;
9949 }
9950 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009951 }
Tim Petersced69f82003-09-16 20:30:58 +00009952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009954 if (res == NULL)
9955 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009956
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009957 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009958#ifdef Py_DEBUG
9959 use_memcpy = 0;
9960#else
9961 if (use_memcpy) {
9962 res_data = PyUnicode_1BYTE_DATA(res);
9963 kind = PyUnicode_KIND(res);
9964 if (seplen != 0)
9965 sep_data = PyUnicode_1BYTE_DATA(sep);
9966 }
9967#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009968 if (use_memcpy) {
9969 for (i = 0; i < seqlen; ++i) {
9970 Py_ssize_t itemlen;
9971 item = items[i];
9972
9973 /* Copy item, and maybe the separator. */
9974 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009975 Py_MEMCPY(res_data,
9976 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009977 kind * seplen);
9978 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009979 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009980
9981 itemlen = PyUnicode_GET_LENGTH(item);
9982 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009983 Py_MEMCPY(res_data,
9984 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009985 kind * itemlen);
9986 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009987 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009988 }
9989 assert(res_data == PyUnicode_1BYTE_DATA(res)
9990 + kind * PyUnicode_GET_LENGTH(res));
9991 }
9992 else {
9993 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9994 Py_ssize_t itemlen;
9995 item = items[i];
9996
9997 /* Copy item, and maybe the separator. */
9998 if (i && seplen != 0) {
9999 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10000 res_offset += seplen;
10001 }
10002
10003 itemlen = PyUnicode_GET_LENGTH(item);
10004 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010005 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010006 res_offset += itemlen;
10007 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010008 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010009 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010010 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010011
Tim Peters05eba1f2004-08-27 21:32:02 +000010012 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010014 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016
Benjamin Peterson29060642009-01-31 22:14:21 +000010017 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010018 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010020 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021 return NULL;
10022}
10023
/* FILL(kind, data, value, start, length)
 *
 * Store `value` into `length` consecutive character slots of the buffer
 * `data`, beginning at character index `start`.  `kind` selects the slot
 * width (1, 2 or 4 byte storage); the wchar kind is rejected by an assert.
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions (e.g. `a + b` or `c ? x : y`) bind as the caller intended.
 * Note that `kind`, `value` and `length` may still be evaluated more than
 * once, so arguments must be free of side effects.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10047
Victor Stinnerd3f08822012-05-29 12:57:52 +020010048void
10049_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10050 Py_UCS4 fill_char)
10051{
10052 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10053 const void *data = PyUnicode_DATA(unicode);
10054 assert(PyUnicode_IS_READY(unicode));
10055 assert(unicode_modifiable(unicode));
10056 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10057 assert(start >= 0);
10058 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10059 FILL(kind, data, fill_char, start, length);
10060}
10061
Victor Stinner3fe55312012-01-04 00:33:50 +010010062Py_ssize_t
10063PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10064 Py_UCS4 fill_char)
10065{
10066 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010067
10068 if (!PyUnicode_Check(unicode)) {
10069 PyErr_BadInternalCall();
10070 return -1;
10071 }
10072 if (PyUnicode_READY(unicode) == -1)
10073 return -1;
10074 if (unicode_check_modifiable(unicode))
10075 return -1;
10076
Victor Stinnerd3f08822012-05-29 12:57:52 +020010077 if (start < 0) {
10078 PyErr_SetString(PyExc_IndexError, "string index out of range");
10079 return -1;
10080 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010081 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10082 PyErr_SetString(PyExc_ValueError,
10083 "fill character is bigger than "
10084 "the string maximum character");
10085 return -1;
10086 }
10087
10088 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10089 length = Py_MIN(maxlen, length);
10090 if (length <= 0)
10091 return 0;
10092
Victor Stinnerd3f08822012-05-29 12:57:52 +020010093 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010094 return length;
10095}
10096
Victor Stinner9310abb2011-10-05 00:59:23 +020010097static PyObject *
10098pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010099 Py_ssize_t left,
10100 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 PyObject *u;
10104 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010105 int kind;
10106 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107
10108 if (left < 0)
10109 left = 0;
10110 if (right < 0)
10111 right = 0;
10112
Victor Stinnerc4b49542011-12-11 22:44:26 +010010113 if (left == 0 && right == 0)
10114 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10117 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010118 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10119 return NULL;
10120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010122 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010124 if (!u)
10125 return NULL;
10126
10127 kind = PyUnicode_KIND(u);
10128 data = PyUnicode_DATA(u);
10129 if (left)
10130 FILL(kind, data, fill, 0, left);
10131 if (right)
10132 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010133 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010134 assert(_PyUnicode_CheckConsistency(u, 1));
10135 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136}
10137
Alexander Belopolsky40018472011-02-26 01:02:56 +000010138PyObject *
10139PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142
10143 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010144 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010145 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010146 if (PyUnicode_READY(string) == -1) {
10147 Py_DECREF(string);
10148 return NULL;
10149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150
Benjamin Petersonead6b532011-12-20 17:23:42 -060010151 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010153 if (PyUnicode_IS_ASCII(string))
10154 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010155 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010156 PyUnicode_GET_LENGTH(string), keepends);
10157 else
10158 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010159 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010160 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 break;
10162 case PyUnicode_2BYTE_KIND:
10163 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010164 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 PyUnicode_GET_LENGTH(string), keepends);
10166 break;
10167 case PyUnicode_4BYTE_KIND:
10168 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010169 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 PyUnicode_GET_LENGTH(string), keepends);
10171 break;
10172 default:
10173 assert(0);
10174 list = 0;
10175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176 Py_DECREF(string);
10177 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178}
10179
Alexander Belopolsky40018472011-02-26 01:02:56 +000010180static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010181split(PyObject *self,
10182 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010183 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010185 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 void *buf1, *buf2;
10187 Py_ssize_t len1, len2;
10188 PyObject* out;
10189
Guido van Rossumd57fd912000-03-10 22:53:23 +000010190 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010191 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 if (PyUnicode_READY(self) == -1)
10194 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010197 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010199 if (PyUnicode_IS_ASCII(self))
10200 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010201 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010202 PyUnicode_GET_LENGTH(self), maxcount
10203 );
10204 else
10205 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010206 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010207 PyUnicode_GET_LENGTH(self), maxcount
10208 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 case PyUnicode_2BYTE_KIND:
10210 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010211 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 PyUnicode_GET_LENGTH(self), maxcount
10213 );
10214 case PyUnicode_4BYTE_KIND:
10215 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010216 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 PyUnicode_GET_LENGTH(self), maxcount
10218 );
10219 default:
10220 assert(0);
10221 return NULL;
10222 }
10223
10224 if (PyUnicode_READY(substring) == -1)
10225 return NULL;
10226
10227 kind1 = PyUnicode_KIND(self);
10228 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 len1 = PyUnicode_GET_LENGTH(self);
10230 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010231 if (kind1 < kind2 || len1 < len2) {
10232 out = PyList_New(1);
10233 if (out == NULL)
10234 return NULL;
10235 Py_INCREF(self);
10236 PyList_SET_ITEM(out, 0, self);
10237 return out;
10238 }
10239 buf1 = PyUnicode_DATA(self);
10240 buf2 = PyUnicode_DATA(substring);
10241 if (kind2 != kind1) {
10242 buf2 = _PyUnicode_AsKind(substring, kind1);
10243 if (!buf2)
10244 return NULL;
10245 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010247 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010249 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10250 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010251 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010252 else
10253 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010254 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 break;
10256 case PyUnicode_2BYTE_KIND:
10257 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010258 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 break;
10260 case PyUnicode_4BYTE_KIND:
10261 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010262 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 break;
10264 default:
10265 out = NULL;
10266 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010267 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 PyMem_Free(buf2);
10269 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270}
10271
Alexander Belopolsky40018472011-02-26 01:02:56 +000010272static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010273rsplit(PyObject *self,
10274 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010275 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010276{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010277 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 void *buf1, *buf2;
10279 Py_ssize_t len1, len2;
10280 PyObject* out;
10281
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010282 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010283 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 if (PyUnicode_READY(self) == -1)
10286 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010289 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010291 if (PyUnicode_IS_ASCII(self))
10292 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010293 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010294 PyUnicode_GET_LENGTH(self), maxcount
10295 );
10296 else
10297 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010298 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010299 PyUnicode_GET_LENGTH(self), maxcount
10300 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 case PyUnicode_2BYTE_KIND:
10302 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010303 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 PyUnicode_GET_LENGTH(self), maxcount
10305 );
10306 case PyUnicode_4BYTE_KIND:
10307 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010308 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 PyUnicode_GET_LENGTH(self), maxcount
10310 );
10311 default:
10312 assert(0);
10313 return NULL;
10314 }
10315
10316 if (PyUnicode_READY(substring) == -1)
10317 return NULL;
10318
10319 kind1 = PyUnicode_KIND(self);
10320 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 len1 = PyUnicode_GET_LENGTH(self);
10322 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010323 if (kind1 < kind2 || len1 < len2) {
10324 out = PyList_New(1);
10325 if (out == NULL)
10326 return NULL;
10327 Py_INCREF(self);
10328 PyList_SET_ITEM(out, 0, self);
10329 return out;
10330 }
10331 buf1 = PyUnicode_DATA(self);
10332 buf2 = PyUnicode_DATA(substring);
10333 if (kind2 != kind1) {
10334 buf2 = _PyUnicode_AsKind(substring, kind1);
10335 if (!buf2)
10336 return NULL;
10337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010339 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010341 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10342 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010343 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010344 else
10345 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010346 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 break;
10348 case PyUnicode_2BYTE_KIND:
10349 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010350 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 break;
10352 case PyUnicode_4BYTE_KIND:
10353 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010354 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 break;
10356 default:
10357 out = NULL;
10358 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010359 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 PyMem_Free(buf2);
10361 return out;
10362}
10363
10364static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010365anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10366 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010368 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010370 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10371 return asciilib_find(buf1, len1, buf2, len2, offset);
10372 else
10373 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 case PyUnicode_2BYTE_KIND:
10375 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10376 case PyUnicode_4BYTE_KIND:
10377 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10378 }
10379 assert(0);
10380 return -1;
10381}
10382
10383static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010384anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10385 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010387 switch (kind) {
10388 case PyUnicode_1BYTE_KIND:
10389 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10390 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10391 else
10392 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10393 case PyUnicode_2BYTE_KIND:
10394 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10395 case PyUnicode_4BYTE_KIND:
10396 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10397 }
10398 assert(0);
10399 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010400}
10401
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010402static void
10403replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10404 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10405{
10406 int kind = PyUnicode_KIND(u);
10407 void *data = PyUnicode_DATA(u);
10408 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10409 if (kind == PyUnicode_1BYTE_KIND) {
10410 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10411 (Py_UCS1 *)data + len,
10412 u1, u2, maxcount);
10413 }
10414 else if (kind == PyUnicode_2BYTE_KIND) {
10415 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10416 (Py_UCS2 *)data + len,
10417 u1, u2, maxcount);
10418 }
10419 else {
10420 assert(kind == PyUnicode_4BYTE_KIND);
10421 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10422 (Py_UCS4 *)data + len,
10423 u1, u2, maxcount);
10424 }
10425}
10426
Alexander Belopolsky40018472011-02-26 01:02:56 +000010427static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428replace(PyObject *self, PyObject *str1,
10429 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 PyObject *u;
10432 char *sbuf = PyUnicode_DATA(self);
10433 char *buf1 = PyUnicode_DATA(str1);
10434 char *buf2 = PyUnicode_DATA(str2);
10435 int srelease = 0, release1 = 0, release2 = 0;
10436 int skind = PyUnicode_KIND(self);
10437 int kind1 = PyUnicode_KIND(str1);
10438 int kind2 = PyUnicode_KIND(str2);
10439 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10440 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10441 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010442 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010443 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444
10445 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010446 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010448 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010449
Victor Stinner59de0ee2011-10-07 10:01:28 +020010450 if (str1 == str2)
10451 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452
Victor Stinner49a0a212011-10-12 23:46:10 +020010453 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010454 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10455 if (maxchar < maxchar_str1)
10456 /* substring too wide to be present */
10457 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010458 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10459 /* Replacing str1 with str2 may cause a maxchar reduction in the
10460 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010461 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010462 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010465 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010467 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010469 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010470 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010471 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010472
Victor Stinner69ed0f42013-04-09 21:48:24 +020010473 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010474 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010475 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010476 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010477 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010479 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010481
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010482 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10483 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010484 }
10485 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 int rkind = skind;
10487 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010488 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 if (kind1 < rkind) {
10491 /* widen substring */
10492 buf1 = _PyUnicode_AsKind(str1, rkind);
10493 if (!buf1) goto error;
10494 release1 = 1;
10495 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010496 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010497 if (i < 0)
10498 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 if (rkind > kind2) {
10500 /* widen replacement */
10501 buf2 = _PyUnicode_AsKind(str2, rkind);
10502 if (!buf2) goto error;
10503 release2 = 1;
10504 }
10505 else if (rkind < kind2) {
10506 /* widen self and buf1 */
10507 rkind = kind2;
10508 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010509 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 sbuf = _PyUnicode_AsKind(self, rkind);
10511 if (!sbuf) goto error;
10512 srelease = 1;
10513 buf1 = _PyUnicode_AsKind(str1, rkind);
10514 if (!buf1) goto error;
10515 release1 = 1;
10516 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010517 u = PyUnicode_New(slen, maxchar);
10518 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010520 assert(PyUnicode_KIND(u) == rkind);
10521 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010522
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010523 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010524 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010525 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010527 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010529
10530 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010531 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010532 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010533 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010534 if (i == -1)
10535 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010536 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010538 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010542 }
10543 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010545 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 int rkind = skind;
10547 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010550 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 buf1 = _PyUnicode_AsKind(str1, rkind);
10552 if (!buf1) goto error;
10553 release1 = 1;
10554 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010555 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010556 if (n == 0)
10557 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010559 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 buf2 = _PyUnicode_AsKind(str2, rkind);
10561 if (!buf2) goto error;
10562 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010565 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 rkind = kind2;
10567 sbuf = _PyUnicode_AsKind(self, rkind);
10568 if (!sbuf) goto error;
10569 srelease = 1;
10570 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010571 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 buf1 = _PyUnicode_AsKind(str1, rkind);
10573 if (!buf1) goto error;
10574 release1 = 1;
10575 }
10576 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10577 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010578 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 PyErr_SetString(PyExc_OverflowError,
10580 "replace string is too long");
10581 goto error;
10582 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010583 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010584 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010585 _Py_INCREF_UNICODE_EMPTY();
10586 if (!unicode_empty)
10587 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010588 u = unicode_empty;
10589 goto done;
10590 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010591 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 PyErr_SetString(PyExc_OverflowError,
10593 "replace string is too long");
10594 goto error;
10595 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010596 u = PyUnicode_New(new_size, maxchar);
10597 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010599 assert(PyUnicode_KIND(u) == rkind);
10600 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 ires = i = 0;
10602 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010603 while (n-- > 0) {
10604 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010605 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010606 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010607 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010608 if (j == -1)
10609 break;
10610 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010611 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010612 memcpy(res + rkind * ires,
10613 sbuf + rkind * i,
10614 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010616 }
10617 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010619 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010621 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010627 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010628 memcpy(res + rkind * ires,
10629 sbuf + rkind * i,
10630 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010631 }
10632 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010633 /* interleave */
10634 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010635 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010637 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010639 if (--n <= 0)
10640 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010641 memcpy(res + rkind * ires,
10642 sbuf + rkind * i,
10643 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 ires++;
10645 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010646 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010647 memcpy(res + rkind * ires,
10648 sbuf + rkind * i,
10649 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010651 }
10652
10653 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010654 unicode_adjust_maxchar(&u);
10655 if (u == NULL)
10656 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010658
10659 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 if (srelease)
10661 PyMem_FREE(sbuf);
10662 if (release1)
10663 PyMem_FREE(buf1);
10664 if (release2)
10665 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010666 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010668
Benjamin Peterson29060642009-01-31 22:14:21 +000010669 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 if (srelease)
10672 PyMem_FREE(sbuf);
10673 if (release1)
10674 PyMem_FREE(buf1);
10675 if (release2)
10676 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010677 return unicode_result_unchanged(self);
10678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 error:
10680 if (srelease && sbuf)
10681 PyMem_FREE(sbuf);
10682 if (release1 && buf1)
10683 PyMem_FREE(buf1);
10684 if (release2 && buf2)
10685 PyMem_FREE(buf2);
10686 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010687}
10688
10689/* --- Unicode Object Methods --------------------------------------------- */
10690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010691PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010692 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693\n\
10694Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010695characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696
10697static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010698unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010700 if (PyUnicode_READY(self) == -1)
10701 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010702 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703}
10704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010705PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010706 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707\n\
10708Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010709have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710
10711static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010712unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010714 if (PyUnicode_READY(self) == -1)
10715 return NULL;
10716 if (PyUnicode_GET_LENGTH(self) == 0)
10717 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010718 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719}
10720
Benjamin Petersond5890c82012-01-14 13:23:30 -050010721PyDoc_STRVAR(casefold__doc__,
10722 "S.casefold() -> str\n\
10723\n\
10724Return a version of S suitable for caseless comparisons.");
10725
10726static PyObject *
10727unicode_casefold(PyObject *self)
10728{
10729 if (PyUnicode_READY(self) == -1)
10730 return NULL;
10731 if (PyUnicode_IS_ASCII(self))
10732 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010733 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010734}
10735
10736
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010737/* Argument converter. Coerces to a single unicode character */
10738
10739static int
10740convert_uc(PyObject *obj, void *addr)
10741{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010743 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010744
Benjamin Peterson14339b62009-01-31 16:36:08 +000010745 uniobj = PyUnicode_FromObject(obj);
10746 if (uniobj == NULL) {
10747 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010748 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010749 return 0;
10750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010752 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010753 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010754 Py_DECREF(uniobj);
10755 return 0;
10756 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010758 Py_DECREF(uniobj);
10759 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010760}
10761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010762PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010763 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010765Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010766done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767
10768static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010769unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010771 Py_ssize_t marg, left;
10772 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 Py_UCS4 fillchar = ' ';
10774
Victor Stinnere9a29352011-10-01 02:14:59 +020010775 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777
Benjamin Petersonbac79492012-01-14 13:34:47 -050010778 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779 return NULL;
10780
Victor Stinnerc4b49542011-12-11 22:44:26 +010010781 if (PyUnicode_GET_LENGTH(self) >= width)
10782 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783
Victor Stinnerc4b49542011-12-11 22:44:26 +010010784 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785 left = marg / 2 + (marg & width & 1);
10786
Victor Stinner9310abb2011-10-05 00:59:23 +020010787 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788}
10789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790/* This function assumes that str1 and str2 are readied by the caller. */
10791
Marc-André Lemburge5034372000-08-08 08:04:29 +000010792static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010793unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010794{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010795#define COMPARE(TYPE1, TYPE2) \
10796 do { \
10797 TYPE1* p1 = (TYPE1 *)data1; \
10798 TYPE2* p2 = (TYPE2 *)data2; \
10799 TYPE1* end = p1 + len; \
10800 Py_UCS4 c1, c2; \
10801 for (; p1 != end; p1++, p2++) { \
10802 c1 = *p1; \
10803 c2 = *p2; \
10804 if (c1 != c2) \
10805 return (c1 < c2) ? -1 : 1; \
10806 } \
10807 } \
10808 while (0)
10809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 int kind1, kind2;
10811 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010812 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 kind1 = PyUnicode_KIND(str1);
10815 kind2 = PyUnicode_KIND(str2);
10816 data1 = PyUnicode_DATA(str1);
10817 data2 = PyUnicode_DATA(str2);
10818 len1 = PyUnicode_GET_LENGTH(str1);
10819 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010820 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010821
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010822 switch(kind1) {
10823 case PyUnicode_1BYTE_KIND:
10824 {
10825 switch(kind2) {
10826 case PyUnicode_1BYTE_KIND:
10827 {
10828 int cmp = memcmp(data1, data2, len);
10829 /* normalize result of memcmp() into the range [-1; 1] */
10830 if (cmp < 0)
10831 return -1;
10832 if (cmp > 0)
10833 return 1;
10834 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010835 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010836 case PyUnicode_2BYTE_KIND:
10837 COMPARE(Py_UCS1, Py_UCS2);
10838 break;
10839 case PyUnicode_4BYTE_KIND:
10840 COMPARE(Py_UCS1, Py_UCS4);
10841 break;
10842 default:
10843 assert(0);
10844 }
10845 break;
10846 }
10847 case PyUnicode_2BYTE_KIND:
10848 {
10849 switch(kind2) {
10850 case PyUnicode_1BYTE_KIND:
10851 COMPARE(Py_UCS2, Py_UCS1);
10852 break;
10853 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010854 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010855 COMPARE(Py_UCS2, Py_UCS2);
10856 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010857 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010858 case PyUnicode_4BYTE_KIND:
10859 COMPARE(Py_UCS2, Py_UCS4);
10860 break;
10861 default:
10862 assert(0);
10863 }
10864 break;
10865 }
10866 case PyUnicode_4BYTE_KIND:
10867 {
10868 switch(kind2) {
10869 case PyUnicode_1BYTE_KIND:
10870 COMPARE(Py_UCS4, Py_UCS1);
10871 break;
10872 case PyUnicode_2BYTE_KIND:
10873 COMPARE(Py_UCS4, Py_UCS2);
10874 break;
10875 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010876 {
10877#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10878 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10879 /* normalize result of wmemcmp() into the range [-1; 1] */
10880 if (cmp < 0)
10881 return -1;
10882 if (cmp > 0)
10883 return 1;
10884#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010885 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010886#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010887 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010888 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010889 default:
10890 assert(0);
10891 }
10892 break;
10893 }
10894 default:
10895 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010896 }
10897
Victor Stinner770e19e2012-10-04 22:59:45 +020010898 if (len1 == len2)
10899 return 0;
10900 if (len1 < len2)
10901 return -1;
10902 else
10903 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010904
10905#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010906}
10907
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010908Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010909unicode_compare_eq(PyObject *str1, PyObject *str2)
10910{
10911 int kind;
10912 void *data1, *data2;
10913 Py_ssize_t len;
10914 int cmp;
10915
Victor Stinnere5567ad2012-10-23 02:48:49 +020010916 len = PyUnicode_GET_LENGTH(str1);
10917 if (PyUnicode_GET_LENGTH(str2) != len)
10918 return 0;
10919 kind = PyUnicode_KIND(str1);
10920 if (PyUnicode_KIND(str2) != kind)
10921 return 0;
10922 data1 = PyUnicode_DATA(str1);
10923 data2 = PyUnicode_DATA(str2);
10924
10925 cmp = memcmp(data1, data2, len * kind);
10926 return (cmp == 0);
10927}
10928
10929
Alexander Belopolsky40018472011-02-26 01:02:56 +000010930int
10931PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10934 if (PyUnicode_READY(left) == -1 ||
10935 PyUnicode_READY(right) == -1)
10936 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010937
10938 /* a string is equal to itself */
10939 if (left == right)
10940 return 0;
10941
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010942 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010944 PyErr_Format(PyExc_TypeError,
10945 "Can't compare %.100s and %.100s",
10946 left->ob_type->tp_name,
10947 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948 return -1;
10949}
10950
Martin v. Löwis5b222132007-06-10 09:51:05 +000010951int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010952_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10953{
10954 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10955 if (right_str == NULL)
10956 return -1;
10957 return PyUnicode_Compare(left, right_str);
10958}
10959
10960int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010961PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10962{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 Py_ssize_t i;
10964 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 Py_UCS4 chr;
10966
Victor Stinner910337b2011-10-03 03:20:16 +020010967 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 if (PyUnicode_READY(uni) == -1)
10969 return -1;
10970 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010971 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010972 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010973 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010974 size_t len, len2 = strlen(str);
10975 int cmp;
10976
10977 len = Py_MIN(len1, len2);
10978 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010979 if (cmp != 0) {
10980 if (cmp < 0)
10981 return -1;
10982 else
10983 return 1;
10984 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010985 if (len1 > len2)
10986 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010987 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010988 return -1; /* str is longer */
10989 return 0;
10990 }
10991 else {
10992 void *data = PyUnicode_DATA(uni);
10993 /* Compare Unicode string and source character set string */
10994 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010995 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010996 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10997 /* This check keeps Python strings that end in '\0' from comparing equal
10998 to C strings identical up to that point. */
10999 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11000 return 1; /* uni is longer */
11001 if (str[i])
11002 return -1; /* str is longer */
11003 return 0;
11004 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011005}
11006
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011007
Benjamin Peterson29060642009-01-31 22:14:21 +000011008#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011009 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011010
Alexander Belopolsky40018472011-02-26 01:02:56 +000011011PyObject *
11012PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011013{
11014 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011015 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011016
Victor Stinnere5567ad2012-10-23 02:48:49 +020011017 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11018 Py_RETURN_NOTIMPLEMENTED;
11019
11020 if (PyUnicode_READY(left) == -1 ||
11021 PyUnicode_READY(right) == -1)
11022 return NULL;
11023
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011024 if (left == right) {
11025 switch (op) {
11026 case Py_EQ:
11027 case Py_LE:
11028 case Py_GE:
11029 /* a string is equal to itself */
11030 v = Py_True;
11031 break;
11032 case Py_NE:
11033 case Py_LT:
11034 case Py_GT:
11035 v = Py_False;
11036 break;
11037 default:
11038 PyErr_BadArgument();
11039 return NULL;
11040 }
11041 }
11042 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011043 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011044 result ^= (op == Py_NE);
11045 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011046 }
11047 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011048 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011049
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011050 /* Convert the return value to a Boolean */
11051 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011052 case Py_LE:
11053 v = TEST_COND(result <= 0);
11054 break;
11055 case Py_GE:
11056 v = TEST_COND(result >= 0);
11057 break;
11058 case Py_LT:
11059 v = TEST_COND(result == -1);
11060 break;
11061 case Py_GT:
11062 v = TEST_COND(result == 1);
11063 break;
11064 default:
11065 PyErr_BadArgument();
11066 return NULL;
11067 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011068 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011069 Py_INCREF(v);
11070 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011071}
11072
Alexander Belopolsky40018472011-02-26 01:02:56 +000011073int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011074_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11075{
11076 return unicode_eq(aa, bb);
11077}
11078
11079int
Alexander Belopolsky40018472011-02-26 01:02:56 +000011080PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011081{
Thomas Wouters477c8d52006-05-27 19:21:47 +000011082 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020011083 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 void *buf1, *buf2;
11085 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011086 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011087
11088 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000011089 sub = PyUnicode_FromObject(element);
11090 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011091 PyErr_Format(PyExc_TypeError,
11092 "'in <string>' requires string as left operand, not %s",
11093 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011094 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011095 }
11096
Thomas Wouters477c8d52006-05-27 19:21:47 +000011097 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011098 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011099 Py_DECREF(sub);
11100 return -1;
11101 }
11102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 kind1 = PyUnicode_KIND(str);
11104 kind2 = PyUnicode_KIND(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011105 if (kind1 < kind2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011106 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050011107 Py_DECREF(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011108 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109 }
11110 len1 = PyUnicode_GET_LENGTH(str);
11111 len2 = PyUnicode_GET_LENGTH(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011112 if (len1 < len2) {
11113 Py_DECREF(sub);
11114 Py_DECREF(str);
11115 return 0;
11116 }
11117 buf1 = PyUnicode_DATA(str);
11118 buf2 = PyUnicode_DATA(sub);
11119 if (len2 == 1) {
11120 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11121 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11122 Py_DECREF(sub);
11123 Py_DECREF(str);
11124 return result;
11125 }
11126 if (kind2 != kind1) {
11127 buf2 = _PyUnicode_AsKind(sub, kind1);
11128 if (!buf2) {
11129 Py_DECREF(sub);
11130 Py_DECREF(str);
11131 return -1;
11132 }
11133 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134
Victor Stinner77282cb2013-04-14 19:22:47 +020011135 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 case PyUnicode_1BYTE_KIND:
11137 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11138 break;
11139 case PyUnicode_2BYTE_KIND:
11140 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11141 break;
11142 case PyUnicode_4BYTE_KIND:
11143 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11144 break;
11145 default:
11146 result = -1;
11147 assert(0);
11148 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011149
11150 Py_DECREF(str);
11151 Py_DECREF(sub);
11152
Victor Stinner77282cb2013-04-14 19:22:47 +020011153 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 PyMem_Free(buf2);
11155
Guido van Rossum403d68b2000-03-13 15:55:09 +000011156 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011157}
11158
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159/* Concat to string or Unicode object giving a new Unicode object. */
11160
Alexander Belopolsky40018472011-02-26 01:02:56 +000011161PyObject *
11162PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011165 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011166 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167
11168 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011171 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011174 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175
11176 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011177 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011178 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011181 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011182 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184 }
11185
Victor Stinner488fa492011-12-12 00:01:39 +010011186 u_len = PyUnicode_GET_LENGTH(u);
11187 v_len = PyUnicode_GET_LENGTH(v);
11188 if (u_len > PY_SSIZE_T_MAX - v_len) {
11189 PyErr_SetString(PyExc_OverflowError,
11190 "strings are too large to concat");
11191 goto onError;
11192 }
11193 new_len = u_len + v_len;
11194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011196 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011197 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011200 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011202 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011203 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11204 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205 Py_DECREF(u);
11206 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011207 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209
Benjamin Peterson29060642009-01-31 22:14:21 +000011210 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211 Py_XDECREF(u);
11212 Py_XDECREF(v);
11213 return NULL;
11214}
11215
Walter Dörwald1ab83302007-05-18 17:15:44 +000011216void
Victor Stinner23e56682011-10-03 03:54:37 +020011217PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011218{
Victor Stinner23e56682011-10-03 03:54:37 +020011219 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011220 Py_UCS4 maxchar, maxchar2;
11221 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011222
11223 if (p_left == NULL) {
11224 if (!PyErr_Occurred())
11225 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011226 return;
11227 }
Victor Stinner23e56682011-10-03 03:54:37 +020011228 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011229 if (right == NULL || left == NULL
11230 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011231 if (!PyErr_Occurred())
11232 PyErr_BadInternalCall();
11233 goto error;
11234 }
11235
Benjamin Petersonbac79492012-01-14 13:34:47 -050011236 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011237 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011238 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011239 goto error;
11240
Victor Stinner488fa492011-12-12 00:01:39 +010011241 /* Shortcuts */
11242 if (left == unicode_empty) {
11243 Py_DECREF(left);
11244 Py_INCREF(right);
11245 *p_left = right;
11246 return;
11247 }
11248 if (right == unicode_empty)
11249 return;
11250
11251 left_len = PyUnicode_GET_LENGTH(left);
11252 right_len = PyUnicode_GET_LENGTH(right);
11253 if (left_len > PY_SSIZE_T_MAX - right_len) {
11254 PyErr_SetString(PyExc_OverflowError,
11255 "strings are too large to concat");
11256 goto error;
11257 }
11258 new_len = left_len + right_len;
11259
11260 if (unicode_modifiable(left)
11261 && PyUnicode_CheckExact(right)
11262 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011263 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11264 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011265 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011266 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011267 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11268 {
11269 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011270 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011271 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011272
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011273 /* copy 'right' into the newly allocated area of 'left' */
11274 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011275 }
Victor Stinner488fa492011-12-12 00:01:39 +010011276 else {
11277 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11278 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011279 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011280
Victor Stinner488fa492011-12-12 00:01:39 +010011281 /* Concat the two Unicode strings */
11282 res = PyUnicode_New(new_len, maxchar);
11283 if (res == NULL)
11284 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011285 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11286 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011287 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011288 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011289 }
11290 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011291 return;
11292
11293error:
Victor Stinner488fa492011-12-12 00:01:39 +010011294 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011295}
11296
11297void
11298PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11299{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011300 PyUnicode_Append(pleft, right);
11301 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011302}
11303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011304PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011305 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011307Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011308string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011309interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310
11311static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011312unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011314 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011315 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011316 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011318 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 void *buf1, *buf2;
11320 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321
Jesus Ceaac451502011-04-20 17:09:23 +020011322 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11323 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011324 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 kind1 = PyUnicode_KIND(self);
11327 kind2 = PyUnicode_KIND(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011328 if (kind1 < kind2) {
Christian Heimesd47802e2013-06-29 21:33:36 +020011329 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011330 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011331 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 len1 = PyUnicode_GET_LENGTH(self);
11333 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011335 if (end - start < len2) {
11336 Py_DECREF(substring);
11337 return PyLong_FromLong(0);
11338 }
11339 buf1 = PyUnicode_DATA(self);
11340 buf2 = PyUnicode_DATA(substring);
11341 if (kind2 != kind1) {
11342 buf2 = _PyUnicode_AsKind(substring, kind1);
11343 if (!buf2) {
11344 Py_DECREF(substring);
11345 return NULL;
11346 }
11347 }
11348 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 case PyUnicode_1BYTE_KIND:
11350 iresult = ucs1lib_count(
11351 ((Py_UCS1*)buf1) + start, end - start,
11352 buf2, len2, PY_SSIZE_T_MAX
11353 );
11354 break;
11355 case PyUnicode_2BYTE_KIND:
11356 iresult = ucs2lib_count(
11357 ((Py_UCS2*)buf1) + start, end - start,
11358 buf2, len2, PY_SSIZE_T_MAX
11359 );
11360 break;
11361 case PyUnicode_4BYTE_KIND:
11362 iresult = ucs4lib_count(
11363 ((Py_UCS4*)buf1) + start, end - start,
11364 buf2, len2, PY_SSIZE_T_MAX
11365 );
11366 break;
11367 default:
11368 assert(0); iresult = 0;
11369 }
11370
11371 result = PyLong_FromSsize_t(iresult);
11372
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011373 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375
11376 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011377
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378 return result;
11379}
11380
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011381PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011382 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011384Encode S using the codec registered for encoding. Default encoding\n\
11385is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011386handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011387a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11388'xmlcharrefreplace' as well as any other name registered with\n\
11389codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390
11391static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011392unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011394 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395 char *encoding = NULL;
11396 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011397
Benjamin Peterson308d6372009-09-18 21:42:35 +000011398 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11399 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011401 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011402}
11403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011404PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011405 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406\n\
11407Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011408If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409
11410static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011411unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011413 Py_ssize_t i, j, line_pos, src_len, incr;
11414 Py_UCS4 ch;
11415 PyObject *u;
11416 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011417 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011419 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011420 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
Ezio Melotti745d54d2013-11-16 19:10:57 +020011422 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11423 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011424 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
Antoine Pitrou22425222011-10-04 19:10:51 +020011426 if (PyUnicode_READY(self) == -1)
11427 return NULL;
11428
Thomas Wouters7e474022000-07-16 12:04:32 +000011429 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011430 src_len = PyUnicode_GET_LENGTH(self);
11431 i = j = line_pos = 0;
11432 kind = PyUnicode_KIND(self);
11433 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011434 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011435 for (; i < src_len; i++) {
11436 ch = PyUnicode_READ(kind, src_data, i);
11437 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011438 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011439 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011440 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011441 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011442 goto overflow;
11443 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011445 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011449 goto overflow;
11450 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011452 if (ch == '\n' || ch == '\r')
11453 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011455 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011456 if (!found)
11457 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011458
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011460 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461 if (!u)
11462 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011463 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464
Antoine Pitroue71d5742011-10-04 15:55:09 +020011465 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
Antoine Pitroue71d5742011-10-04 15:55:09 +020011467 for (; i < src_len; i++) {
11468 ch = PyUnicode_READ(kind, src_data, i);
11469 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011471 incr = tabsize - (line_pos % tabsize);
11472 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011473 FILL(kind, dest_data, ' ', j, incr);
11474 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011476 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011477 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011478 line_pos++;
11479 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011480 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011481 if (ch == '\n' || ch == '\r')
11482 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011484 }
11485 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011486 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011487
Antoine Pitroue71d5742011-10-04 15:55:09 +020011488 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011489 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491}
11492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011493PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495\n\
11496Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011497such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498arguments start and end are interpreted as in slice notation.\n\
11499\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011500Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
11502static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011505 /* initialize variables to prevent gcc warning */
11506 PyObject *substring = NULL;
11507 Py_ssize_t start = 0;
11508 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011509 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
Jesus Ceaac451502011-04-20 17:09:23 +020011511 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11512 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514
Christian Heimesd47802e2013-06-29 21:33:36 +020011515 if (PyUnicode_READY(self) == -1) {
11516 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011518 }
11519 if (PyUnicode_READY(substring) == -1) {
11520 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523
Victor Stinner7931d9a2011-11-04 00:22:48 +010011524 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525
11526 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 if (result == -2)
11529 return NULL;
11530
Christian Heimes217cfd12007-12-02 14:31:20 +000011531 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532}
11533
11534static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011535unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011537 void *data;
11538 enum PyUnicode_Kind kind;
11539 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011540
11541 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11542 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011544 }
11545 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11546 PyErr_SetString(PyExc_IndexError, "string index out of range");
11547 return NULL;
11548 }
11549 kind = PyUnicode_KIND(self);
11550 data = PyUnicode_DATA(self);
11551 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011552 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553}
11554
Guido van Rossumc2504932007-09-18 19:42:40 +000011555/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011556 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011557static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011558unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559{
Guido van Rossumc2504932007-09-18 19:42:40 +000011560 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011561 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011562
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011563#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011564 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011565#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 if (_PyUnicode_HASH(self) != -1)
11567 return _PyUnicode_HASH(self);
11568 if (PyUnicode_READY(self) == -1)
11569 return -1;
11570 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011571 /*
11572 We make the hash of the empty string be 0, rather than using
11573 (prefix ^ suffix), since this slightly obfuscates the hash secret
11574 */
11575 if (len == 0) {
11576 _PyUnicode_HASH(self) = 0;
11577 return 0;
11578 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011579 x = _Py_HashBytes(PyUnicode_DATA(self),
11580 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011581 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011582 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583}
11584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011585PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011588Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589
11590static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011593 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011594 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011595 PyObject *substring = NULL;
11596 Py_ssize_t start = 0;
11597 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598
Jesus Ceaac451502011-04-20 17:09:23 +020011599 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11600 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602
Christian Heimesd47a0452013-06-29 21:21:37 +020011603 if (PyUnicode_READY(self) == -1) {
11604 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011606 }
11607 if (PyUnicode_READY(substring) == -1) {
11608 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611
Victor Stinner7931d9a2011-11-04 00:22:48 +010011612 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613
11614 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 if (result == -2)
11617 return NULL;
11618
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619 if (result < 0) {
11620 PyErr_SetString(PyExc_ValueError, "substring not found");
11621 return NULL;
11622 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011623
Christian Heimes217cfd12007-12-02 14:31:20 +000011624 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625}
11626
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011627PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011628 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011630Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011631at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632
11633static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011634unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 Py_ssize_t i, length;
11637 int kind;
11638 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639 int cased;
11640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 if (PyUnicode_READY(self) == -1)
11642 return NULL;
11643 length = PyUnicode_GET_LENGTH(self);
11644 kind = PyUnicode_KIND(self);
11645 data = PyUnicode_DATA(self);
11646
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011648 if (length == 1)
11649 return PyBool_FromLong(
11650 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011652 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011654 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011655
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 for (i = 0; i < length; i++) {
11658 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011659
Benjamin Peterson29060642009-01-31 22:14:21 +000011660 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11661 return PyBool_FromLong(0);
11662 else if (!cased && Py_UNICODE_ISLOWER(ch))
11663 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011665 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666}
11667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011668PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011669 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011671Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011672at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673
11674static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011675unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 Py_ssize_t i, length;
11678 int kind;
11679 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680 int cased;
11681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 if (PyUnicode_READY(self) == -1)
11683 return NULL;
11684 length = PyUnicode_GET_LENGTH(self);
11685 kind = PyUnicode_KIND(self);
11686 data = PyUnicode_DATA(self);
11687
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 if (length == 1)
11690 return PyBool_FromLong(
11691 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011693 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011695 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011696
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 for (i = 0; i < length; i++) {
11699 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011700
Benjamin Peterson29060642009-01-31 22:14:21 +000011701 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11702 return PyBool_FromLong(0);
11703 else if (!cased && Py_UNICODE_ISUPPER(ch))
11704 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011706 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707}
11708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011709PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011710 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011712Return True if S is a titlecased string and there is at least one\n\
11713character in S, i.e. upper- and titlecase characters may only\n\
11714follow uncased characters and lowercase characters only cased ones.\n\
11715Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716
11717static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011718unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 Py_ssize_t i, length;
11721 int kind;
11722 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723 int cased, previous_is_cased;
11724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011725 if (PyUnicode_READY(self) == -1)
11726 return NULL;
11727 length = PyUnicode_GET_LENGTH(self);
11728 kind = PyUnicode_KIND(self);
11729 data = PyUnicode_DATA(self);
11730
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011732 if (length == 1) {
11733 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11734 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11735 (Py_UNICODE_ISUPPER(ch) != 0));
11736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011738 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011741
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742 cased = 0;
11743 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 for (i = 0; i < length; i++) {
11745 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011746
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11748 if (previous_is_cased)
11749 return PyBool_FromLong(0);
11750 previous_is_cased = 1;
11751 cased = 1;
11752 }
11753 else if (Py_UNICODE_ISLOWER(ch)) {
11754 if (!previous_is_cased)
11755 return PyBool_FromLong(0);
11756 previous_is_cased = 1;
11757 cased = 1;
11758 }
11759 else
11760 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011762 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763}
11764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011765PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011766 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011768Return True if all characters in S are whitespace\n\
11769and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770
11771static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011772unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 Py_ssize_t i, length;
11775 int kind;
11776 void *data;
11777
11778 if (PyUnicode_READY(self) == -1)
11779 return NULL;
11780 length = PyUnicode_GET_LENGTH(self);
11781 kind = PyUnicode_KIND(self);
11782 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 if (length == 1)
11786 return PyBool_FromLong(
11787 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011789 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 for (i = 0; i < length; i++) {
11794 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011795 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011796 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011798 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799}
11800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011801PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011803\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011804Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011805and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011806
11807static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011808unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 Py_ssize_t i, length;
11811 int kind;
11812 void *data;
11813
11814 if (PyUnicode_READY(self) == -1)
11815 return NULL;
11816 length = PyUnicode_GET_LENGTH(self);
11817 kind = PyUnicode_KIND(self);
11818 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011819
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011820 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 if (length == 1)
11822 return PyBool_FromLong(
11823 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011824
11825 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011827 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 for (i = 0; i < length; i++) {
11830 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011831 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011832 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011833 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011834}
11835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011836PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011837 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011838\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011839Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011840and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011841
11842static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011843unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011844{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 int kind;
11846 void *data;
11847 Py_ssize_t len, i;
11848
11849 if (PyUnicode_READY(self) == -1)
11850 return NULL;
11851
11852 kind = PyUnicode_KIND(self);
11853 data = PyUnicode_DATA(self);
11854 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011855
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011856 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 if (len == 1) {
11858 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11859 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11860 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011861
11862 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 for (i = 0; i < len; i++) {
11867 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011868 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011869 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011870 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011871 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011872}
11873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011874PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011875 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011877Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011878False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879
11880static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011881unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 Py_ssize_t i, length;
11884 int kind;
11885 void *data;
11886
11887 if (PyUnicode_READY(self) == -1)
11888 return NULL;
11889 length = PyUnicode_GET_LENGTH(self);
11890 kind = PyUnicode_KIND(self);
11891 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 if (length == 1)
11895 return PyBool_FromLong(
11896 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011898 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 for (i = 0; i < length; i++) {
11903 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011906 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907}
11908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011909PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011910 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011912Return True if all characters in S are digits\n\
11913and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914
11915static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011916unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 Py_ssize_t i, length;
11919 int kind;
11920 void *data;
11921
11922 if (PyUnicode_READY(self) == -1)
11923 return NULL;
11924 length = PyUnicode_GET_LENGTH(self);
11925 kind = PyUnicode_KIND(self);
11926 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 if (length == 1) {
11930 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11931 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011934 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 for (i = 0; i < length; i++) {
11939 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011942 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943}
11944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011945PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011948Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011949False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950
11951static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011952unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 Py_ssize_t i, length;
11955 int kind;
11956 void *data;
11957
11958 if (PyUnicode_READY(self) == -1)
11959 return NULL;
11960 length = PyUnicode_GET_LENGTH(self);
11961 kind = PyUnicode_KIND(self);
11962 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 if (length == 1)
11966 return PyBool_FromLong(
11967 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011969 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 for (i = 0; i < length; i++) {
11974 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011975 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011977 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978}
11979
Martin v. Löwis47383402007-08-15 07:32:56 +000011980int
11981PyUnicode_IsIdentifier(PyObject *self)
11982{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 int kind;
11984 void *data;
11985 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011986 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 if (PyUnicode_READY(self) == -1) {
11989 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011990 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 }
11992
11993 /* Special case for empty strings */
11994 if (PyUnicode_GET_LENGTH(self) == 0)
11995 return 0;
11996 kind = PyUnicode_KIND(self);
11997 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011998
11999 /* PEP 3131 says that the first character must be in
12000 XID_Start and subsequent characters in XID_Continue,
12001 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012002 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012003 letters, digits, underscore). However, given the current
12004 definition of XID_Start and XID_Continue, it is sufficient
12005 to check just for these, except that _ must be allowed
12006 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012008 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012009 return 0;
12010
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012011 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012013 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012014 return 1;
12015}
12016
12017PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012018 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000012019\n\
12020Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070012021to the language definition.\n\
12022\n\
12023Use keyword.iskeyword() to test for reserved identifiers\n\
12024such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000012025
12026static PyObject*
12027unicode_isidentifier(PyObject *self)
12028{
12029 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12030}
12031
Georg Brandl559e5d72008-06-11 18:37:52 +000012032PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012033 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000012034\n\
12035Return True if all characters in S are considered\n\
12036printable in repr() or S is empty, False otherwise.");
12037
12038static PyObject*
12039unicode_isprintable(PyObject *self)
12040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 Py_ssize_t i, length;
12042 int kind;
12043 void *data;
12044
12045 if (PyUnicode_READY(self) == -1)
12046 return NULL;
12047 length = PyUnicode_GET_LENGTH(self);
12048 kind = PyUnicode_KIND(self);
12049 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012050
12051 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 if (length == 1)
12053 return PyBool_FromLong(
12054 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 for (i = 0; i < length; i++) {
12057 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012058 Py_RETURN_FALSE;
12059 }
12060 }
12061 Py_RETURN_TRUE;
12062}
12063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012064PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012065 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066\n\
12067Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012068iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069
12070static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012071unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012073 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074}
12075
Martin v. Löwis18e16552006-02-15 17:27:45 +000012076static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012077unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 if (PyUnicode_READY(self) == -1)
12080 return -1;
12081 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082}
12083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012084PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012085 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012087Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012088done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089
12090static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012091unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012093 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012094 Py_UCS4 fillchar = ' ';
12095
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012096 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097 return NULL;
12098
Benjamin Petersonbac79492012-01-14 13:34:47 -050012099 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101
Victor Stinnerc4b49542011-12-11 22:44:26 +010012102 if (PyUnicode_GET_LENGTH(self) >= width)
12103 return unicode_result_unchanged(self);
12104
12105 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106}
12107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012108PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012111Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112
12113static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012114unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012116 if (PyUnicode_READY(self) == -1)
12117 return NULL;
12118 if (PyUnicode_IS_ASCII(self))
12119 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012120 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121}
12122
/* Selector values telling the strip helpers which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selector values above. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
12131
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012132/* externally visible for str.strip(unicode) */
12133PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012134_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012135{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 void *data;
12137 int kind;
12138 Py_ssize_t i, j, len;
12139 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012140 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12143 return NULL;
12144
12145 kind = PyUnicode_KIND(self);
12146 data = PyUnicode_DATA(self);
12147 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012148 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12150 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012151 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012152
Benjamin Peterson14339b62009-01-31 16:36:08 +000012153 i = 0;
12154 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012155 while (i < len) {
12156 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12157 if (!BLOOM(sepmask, ch))
12158 break;
12159 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12160 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012161 i++;
12162 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012163 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012164
Benjamin Peterson14339b62009-01-31 16:36:08 +000012165 j = len;
12166 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012167 j--;
12168 while (j >= i) {
12169 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12170 if (!BLOOM(sepmask, ch))
12171 break;
12172 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12173 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012174 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012175 }
12176
Benjamin Peterson29060642009-01-31 22:14:21 +000012177 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012178 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012179
Victor Stinner7931d9a2011-11-04 00:22:48 +010012180 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181}
12182
12183PyObject*
12184PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12185{
12186 unsigned char *data;
12187 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012188 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189
Victor Stinnerde636f32011-10-01 03:55:54 +020012190 if (PyUnicode_READY(self) == -1)
12191 return NULL;
12192
Victor Stinner684d5fd2012-05-03 02:32:34 +020012193 length = PyUnicode_GET_LENGTH(self);
12194 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012195
Victor Stinner684d5fd2012-05-03 02:32:34 +020012196 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012197 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198
Victor Stinnerde636f32011-10-01 03:55:54 +020012199 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012200 PyErr_SetString(PyExc_IndexError, "string index out of range");
12201 return NULL;
12202 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012203 if (start >= length || end < start)
12204 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012205
Victor Stinner684d5fd2012-05-03 02:32:34 +020012206 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012207 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012208 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012209 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012210 }
12211 else {
12212 kind = PyUnicode_KIND(self);
12213 data = PyUnicode_1BYTE_DATA(self);
12214 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012215 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012216 length);
12217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219
12220static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012221do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 Py_ssize_t len, i, j;
12224
12225 if (PyUnicode_READY(self) == -1)
12226 return NULL;
12227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012229
Victor Stinnercc7af722013-04-09 22:39:24 +020012230 if (PyUnicode_IS_ASCII(self)) {
12231 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12232
12233 i = 0;
12234 if (striptype != RIGHTSTRIP) {
12235 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012236 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012237 if (!_Py_ascii_whitespace[ch])
12238 break;
12239 i++;
12240 }
12241 }
12242
12243 j = len;
12244 if (striptype != LEFTSTRIP) {
12245 j--;
12246 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012247 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012248 if (!_Py_ascii_whitespace[ch])
12249 break;
12250 j--;
12251 }
12252 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012253 }
12254 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012255 else {
12256 int kind = PyUnicode_KIND(self);
12257 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012258
Victor Stinnercc7af722013-04-09 22:39:24 +020012259 i = 0;
12260 if (striptype != RIGHTSTRIP) {
12261 while (i < len) {
12262 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12263 if (!Py_UNICODE_ISSPACE(ch))
12264 break;
12265 i++;
12266 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012267 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012268
12269 j = len;
12270 if (striptype != LEFTSTRIP) {
12271 j--;
12272 while (j >= i) {
12273 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12274 if (!Py_UNICODE_ISSPACE(ch))
12275 break;
12276 j--;
12277 }
12278 j++;
12279 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012280 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012281
Victor Stinner7931d9a2011-11-04 00:22:48 +010012282 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283}
12284
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012285
12286static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012287do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012288{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012289 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012290
Serhiy Storchakac6792272013-10-19 21:03:34 +030012291 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012292 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012293
Benjamin Peterson14339b62009-01-31 16:36:08 +000012294 if (sep != NULL && sep != Py_None) {
12295 if (PyUnicode_Check(sep))
12296 return _PyUnicode_XStrip(self, striptype, sep);
12297 else {
12298 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012299 "%s arg must be None or str",
12300 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012301 return NULL;
12302 }
12303 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012304
Benjamin Peterson14339b62009-01-31 16:36:08 +000012305 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012306}
12307
12308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012309PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012310 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012311\n\
12312Return a copy of the string S with leading and trailing\n\
12313whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012314If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012315
12316static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012317unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012318{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012319 if (PyTuple_GET_SIZE(args) == 0)
12320 return do_strip(self, BOTHSTRIP); /* Common case */
12321 else
12322 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012323}
12324
12325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012326PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012327 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012328\n\
12329Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012330If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012331
12332static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012333unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012334{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012335 if (PyTuple_GET_SIZE(args) == 0)
12336 return do_strip(self, LEFTSTRIP); /* Common case */
12337 else
12338 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012339}
12340
12341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012342PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012344\n\
12345Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012346If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012347
12348static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012349unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012350{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012351 if (PyTuple_GET_SIZE(args) == 0)
12352 return do_strip(self, RIGHTSTRIP); /* Common case */
12353 else
12354 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012355}
12356
12357
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012359unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012361 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363
Serhiy Storchaka05997252013-01-26 12:14:02 +020012364 if (len < 1)
12365 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366
Victor Stinnerc4b49542011-12-11 22:44:26 +010012367 /* no repeat, return original string */
12368 if (len == 1)
12369 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012370
Benjamin Petersonbac79492012-01-14 13:34:47 -050012371 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 return NULL;
12373
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012374 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012375 PyErr_SetString(PyExc_OverflowError,
12376 "repeated string is too long");
12377 return NULL;
12378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012380
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012381 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 if (!u)
12383 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012384 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 if (PyUnicode_GET_LENGTH(str) == 1) {
12387 const int kind = PyUnicode_KIND(str);
12388 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012389 if (kind == PyUnicode_1BYTE_KIND) {
12390 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012391 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012392 }
12393 else if (kind == PyUnicode_2BYTE_KIND) {
12394 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012395 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012396 ucs2[n] = fill_char;
12397 } else {
12398 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12399 assert(kind == PyUnicode_4BYTE_KIND);
12400 for (n = 0; n < len; ++n)
12401 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012402 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403 }
12404 else {
12405 /* number of characters copied this far */
12406 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012407 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 char *to = (char *) PyUnicode_DATA(u);
12409 Py_MEMCPY(to, PyUnicode_DATA(str),
12410 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412 n = (done <= nchars-done) ? done : nchars-done;
12413 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012414 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416 }
12417
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012418 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012419 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420}
12421
Alexander Belopolsky40018472011-02-26 01:02:56 +000012422PyObject *
12423PyUnicode_Replace(PyObject *obj,
12424 PyObject *subobj,
12425 PyObject *replobj,
12426 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427{
12428 PyObject *self;
12429 PyObject *str1;
12430 PyObject *str2;
12431 PyObject *result;
12432
12433 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012434 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012435 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012437 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012438 Py_DECREF(self);
12439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440 }
12441 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012442 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012443 Py_DECREF(self);
12444 Py_DECREF(str1);
12445 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012446 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012447 if (PyUnicode_READY(self) == -1 ||
12448 PyUnicode_READY(str1) == -1 ||
12449 PyUnicode_READY(str2) == -1)
12450 result = NULL;
12451 else
12452 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012453 Py_DECREF(self);
12454 Py_DECREF(str1);
12455 Py_DECREF(str2);
12456 return result;
12457}
12458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012459PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012460 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461\n\
12462Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012463old replaced by new. If the optional argument count is\n\
12464given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465
12466static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012468{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 PyObject *str1;
12470 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012471 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472 PyObject *result;
12473
Martin v. Löwis18e16552006-02-15 17:27:45 +000012474 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012476 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012477 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012479 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 return NULL;
12481 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012482 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012483 Py_DECREF(str1);
12484 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012485 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012486 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12487 result = NULL;
12488 else
12489 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
12491 Py_DECREF(str1);
12492 Py_DECREF(str2);
12493 return result;
12494}
12495
Alexander Belopolsky40018472011-02-26 01:02:56 +000012496static PyObject *
12497unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012499 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 Py_ssize_t isize;
12501 Py_ssize_t osize, squote, dquote, i, o;
12502 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012503 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012507 return NULL;
12508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 isize = PyUnicode_GET_LENGTH(unicode);
12510 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 /* Compute length of output, quote characters, and
12513 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012514 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 max = 127;
12516 squote = dquote = 0;
12517 ikind = PyUnicode_KIND(unicode);
12518 for (i = 0; i < isize; i++) {
12519 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012520 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012522 case '\'': squote++; break;
12523 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012525 incr = 2;
12526 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 default:
12528 /* Fast-path ASCII */
12529 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012530 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012532 ;
12533 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012536 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012538 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012540 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012542 if (osize > PY_SSIZE_T_MAX - incr) {
12543 PyErr_SetString(PyExc_OverflowError,
12544 "string is too long to generate repr");
12545 return NULL;
12546 }
12547 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 }
12549
12550 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012551 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012553 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 if (dquote)
12555 /* Both squote and dquote present. Use squote,
12556 and escape them */
12557 osize += squote;
12558 else
12559 quote = '"';
12560 }
Victor Stinner55c08782013-04-14 18:45:39 +020012561 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012562
12563 repr = PyUnicode_New(osize, max);
12564 if (repr == NULL)
12565 return NULL;
12566 okind = PyUnicode_KIND(repr);
12567 odata = PyUnicode_DATA(repr);
12568
12569 PyUnicode_WRITE(okind, odata, 0, quote);
12570 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012571 if (unchanged) {
12572 _PyUnicode_FastCopyCharacters(repr, 1,
12573 unicode, 0,
12574 isize);
12575 }
12576 else {
12577 for (i = 0, o = 1; i < isize; i++) {
12578 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579
Victor Stinner55c08782013-04-14 18:45:39 +020012580 /* Escape quotes and backslashes */
12581 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012582 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012584 continue;
12585 }
12586
12587 /* Map special whitespace to '\t', \n', '\r' */
12588 if (ch == '\t') {
12589 PyUnicode_WRITE(okind, odata, o++, '\\');
12590 PyUnicode_WRITE(okind, odata, o++, 't');
12591 }
12592 else if (ch == '\n') {
12593 PyUnicode_WRITE(okind, odata, o++, '\\');
12594 PyUnicode_WRITE(okind, odata, o++, 'n');
12595 }
12596 else if (ch == '\r') {
12597 PyUnicode_WRITE(okind, odata, o++, '\\');
12598 PyUnicode_WRITE(okind, odata, o++, 'r');
12599 }
12600
12601 /* Map non-printable US ASCII to '\xhh' */
12602 else if (ch < ' ' || ch == 0x7F) {
12603 PyUnicode_WRITE(okind, odata, o++, '\\');
12604 PyUnicode_WRITE(okind, odata, o++, 'x');
12605 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12606 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12607 }
12608
12609 /* Copy ASCII characters as-is */
12610 else if (ch < 0x7F) {
12611 PyUnicode_WRITE(okind, odata, o++, ch);
12612 }
12613
12614 /* Non-ASCII characters */
12615 else {
12616 /* Map Unicode whitespace and control characters
12617 (categories Z* and C* except ASCII space)
12618 */
12619 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12620 PyUnicode_WRITE(okind, odata, o++, '\\');
12621 /* Map 8-bit characters to '\xhh' */
12622 if (ch <= 0xff) {
12623 PyUnicode_WRITE(okind, odata, o++, 'x');
12624 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12625 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12626 }
12627 /* Map 16-bit characters to '\uxxxx' */
12628 else if (ch <= 0xffff) {
12629 PyUnicode_WRITE(okind, odata, o++, 'u');
12630 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12631 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12632 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12633 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12634 }
12635 /* Map 21-bit characters to '\U00xxxxxx' */
12636 else {
12637 PyUnicode_WRITE(okind, odata, o++, 'U');
12638 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12639 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12640 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12641 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12642 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12643 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12644 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12645 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12646 }
12647 }
12648 /* Copy characters as-is */
12649 else {
12650 PyUnicode_WRITE(okind, odata, o++, ch);
12651 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012652 }
12653 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012656 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012657 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658}
12659
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012660PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012661 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662\n\
12663Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012664such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665arguments start and end are interpreted as in slice notation.\n\
12666\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012667Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012668
12669static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012672 /* initialize variables to prevent gcc warning */
12673 PyObject *substring = NULL;
12674 Py_ssize_t start = 0;
12675 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012676 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677
Jesus Ceaac451502011-04-20 17:09:23 +020012678 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12679 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012680 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012681
Christian Heimesea71a522013-06-29 21:17:34 +020012682 if (PyUnicode_READY(self) == -1) {
12683 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012685 }
12686 if (PyUnicode_READY(substring) == -1) {
12687 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690
Victor Stinner7931d9a2011-11-04 00:22:48 +010012691 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692
12693 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695 if (result == -2)
12696 return NULL;
12697
Christian Heimes217cfd12007-12-02 14:31:20 +000012698 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699}
12700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012701PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012702 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012704Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705
12706static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012709 /* initialize variables to prevent gcc warning */
12710 PyObject *substring = NULL;
12711 Py_ssize_t start = 0;
12712 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012713 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714
Jesus Ceaac451502011-04-20 17:09:23 +020012715 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12716 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012717 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718
Christian Heimesea71a522013-06-29 21:17:34 +020012719 if (PyUnicode_READY(self) == -1) {
12720 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012722 }
12723 if (PyUnicode_READY(substring) == -1) {
12724 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012726 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012727
Victor Stinner7931d9a2011-11-04 00:22:48 +010012728 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729
12730 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 if (result == -2)
12733 return NULL;
12734
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735 if (result < 0) {
12736 PyErr_SetString(PyExc_ValueError, "substring not found");
12737 return NULL;
12738 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739
Christian Heimes217cfd12007-12-02 14:31:20 +000012740 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741}
12742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012743PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012746Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012747done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012748
12749static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012750unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012751{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012752 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 Py_UCS4 fillchar = ' ';
12754
Victor Stinnere9a29352011-10-01 02:14:59 +020012755 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012757
Benjamin Petersonbac79492012-01-14 13:34:47 -050012758 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759 return NULL;
12760
Victor Stinnerc4b49542011-12-11 22:44:26 +010012761 if (PyUnicode_GET_LENGTH(self) >= width)
12762 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012763
Victor Stinnerc4b49542011-12-11 22:44:26 +010012764 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765}
12766
Alexander Belopolsky40018472011-02-26 01:02:56 +000012767PyObject *
12768PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769{
12770 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012771
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772 s = PyUnicode_FromObject(s);
12773 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012774 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012775 if (sep != NULL) {
12776 sep = PyUnicode_FromObject(sep);
12777 if (sep == NULL) {
12778 Py_DECREF(s);
12779 return NULL;
12780 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781 }
12782
Victor Stinner9310abb2011-10-05 00:59:23 +020012783 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784
12785 Py_DECREF(s);
12786 Py_XDECREF(sep);
12787 return result;
12788}
12789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012790PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012791 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792\n\
12793Return a list of the words in S, using sep as the\n\
12794delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012795splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012796whitespace string is a separator and empty strings are\n\
12797removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798
12799static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012800unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012802 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012804 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012806 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12807 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808 return NULL;
12809
12810 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012811 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012813 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012815 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816}
12817
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818PyObject *
12819PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12820{
12821 PyObject* str_obj;
12822 PyObject* sep_obj;
12823 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012824 int kind1, kind2;
12825 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012827
12828 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012829 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012830 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012831 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012832 if (!sep_obj) {
12833 Py_DECREF(str_obj);
12834 return NULL;
12835 }
12836 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12837 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012838 Py_DECREF(str_obj);
12839 return NULL;
12840 }
12841
Victor Stinner14f8f022011-10-05 20:58:25 +020012842 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012843 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 len1 = PyUnicode_GET_LENGTH(str_obj);
12845 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012846 if (kind1 < kind2 || len1 < len2) {
12847 _Py_INCREF_UNICODE_EMPTY();
12848 if (!unicode_empty)
12849 out = NULL;
12850 else {
12851 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12852 Py_DECREF(unicode_empty);
12853 }
12854 Py_DECREF(sep_obj);
12855 Py_DECREF(str_obj);
12856 return out;
12857 }
12858 buf1 = PyUnicode_DATA(str_obj);
12859 buf2 = PyUnicode_DATA(sep_obj);
12860 if (kind2 != kind1) {
12861 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12862 if (!buf2)
12863 goto onError;
12864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012865
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012866 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012868 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12869 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12870 else
12871 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872 break;
12873 case PyUnicode_2BYTE_KIND:
12874 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12875 break;
12876 case PyUnicode_4BYTE_KIND:
12877 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12878 break;
12879 default:
12880 assert(0);
12881 out = 0;
12882 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012883
12884 Py_DECREF(sep_obj);
12885 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012886 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012887 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012888
12889 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012890 onError:
12891 Py_DECREF(sep_obj);
12892 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012893 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012894 PyMem_Free(buf2);
12895 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012896}
12897
12898
12899PyObject *
12900PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12901{
12902 PyObject* str_obj;
12903 PyObject* sep_obj;
12904 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012905 int kind1, kind2;
12906 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012908
12909 str_obj = PyUnicode_FromObject(str_in);
12910 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012911 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012912 sep_obj = PyUnicode_FromObject(sep_in);
12913 if (!sep_obj) {
12914 Py_DECREF(str_obj);
12915 return NULL;
12916 }
12917
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012918 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012920 len1 = PyUnicode_GET_LENGTH(str_obj);
12921 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012922 if (kind1 < kind2 || len1 < len2) {
12923 _Py_INCREF_UNICODE_EMPTY();
12924 if (!unicode_empty)
12925 out = NULL;
12926 else {
12927 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12928 Py_DECREF(unicode_empty);
12929 }
12930 Py_DECREF(sep_obj);
12931 Py_DECREF(str_obj);
12932 return out;
12933 }
12934 buf1 = PyUnicode_DATA(str_obj);
12935 buf2 = PyUnicode_DATA(sep_obj);
12936 if (kind2 != kind1) {
12937 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12938 if (!buf2)
12939 goto onError;
12940 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012941
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012942 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012943 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012944 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12945 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12946 else
12947 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948 break;
12949 case PyUnicode_2BYTE_KIND:
12950 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12951 break;
12952 case PyUnicode_4BYTE_KIND:
12953 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12954 break;
12955 default:
12956 assert(0);
12957 out = 0;
12958 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012959
12960 Py_DECREF(sep_obj);
12961 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012962 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012963 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012964
12965 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966 onError:
12967 Py_DECREF(sep_obj);
12968 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012969 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 PyMem_Free(buf2);
12971 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012972}
12973
12974PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012975 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012976\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012977Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012978the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012979found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012980
12981static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012982unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012983{
Victor Stinner9310abb2011-10-05 00:59:23 +020012984 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012985}
12986
12987PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012988 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012989\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012990Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012991the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012992separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012993
12994static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012995unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012996{
Victor Stinner9310abb2011-10-05 00:59:23 +020012997 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012998}
12999
/* Public C API entry point for splitting from the right.

   s is coerced with PyUnicode_FromObject(); sep is coerced the same way
   unless it is NULL, in which case NULL is passed through to rsplit().
   Returns rsplit()'s result, or NULL if a coercion fails. */
PyObject *
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
{
    PyObject *str_obj;
    PyObject *sep_obj = NULL;
    PyObject *result;

    str_obj = PyUnicode_FromObject(s);
    if (str_obj == NULL)
        return NULL;

    if (sep != NULL) {
        sep_obj = PyUnicode_FromObject(sep);
        if (sep_obj == NULL) {
            Py_DECREF(str_obj);
            return NULL;
        }
    }

    result = rsplit(str_obj, sep_obj, maxsplit);

    Py_DECREF(str_obj);
    Py_XDECREF(sep_obj);
    return result;
}
13022
/* Help text describing S.rsplit(). */
PyDoc_STRVAR(rsplit__doc__,
"S.rsplit(sep=None, maxsplit=-1) -> list of strings\n"
"\n"
"Return a list of the words in S, using sep as the\n"
"delimiter string, starting at the end of the string and\n"
"working to the front. If maxsplit is given, at most maxsplit\n"
"splits are done. If sep is not specified, any whitespace string\n"
"is a separator.");
13031
/* Method implementation behind rsplit__doc__.

   Accepts the keywords "sep" (default None) and "maxsplit" (default -1).
   None maps to a NULL separator for rsplit(); a str separator goes to
   rsplit() directly; any other object is routed through
   PyUnicode_RSplit(), which coerces it first. */
static PyObject*
unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
{
    static char *keywords[] = {"sep", "maxsplit", 0};
    PyObject *sep = Py_None;
    Py_ssize_t maxsplit = -1;

    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
                                     keywords, &sep, &maxsplit))
        return NULL;

    if (sep == Py_None)
        return rsplit(self, NULL, maxsplit);
    if (!PyUnicode_Check(sep))
        return PyUnicode_RSplit(self, sep, maxsplit);
    return rsplit(self, sep, maxsplit);
}
13050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013051PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013052 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053\n\
13054Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000013055Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013056is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057
13058static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013059unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013060{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013061 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000013062 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013064 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
13065 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066 return NULL;
13067
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013068 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069}
13070
13071static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013072PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013074 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075}
13076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013077PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013078 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079\n\
13080Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013081and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082
13083static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013084unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013086 if (PyUnicode_READY(self) == -1)
13087 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013088 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089}
13090
Larry Hastings61272b72014-01-07 12:41:53 -080013091/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013092
Larry Hastings31826802013-10-19 00:09:25 -070013093@staticmethod
13094str.maketrans as unicode_maketrans
13095
13096 x: object
13097
13098 y: unicode=NULL
13099
13100 z: unicode=NULL
13101
13102 /
13103
13104Return a translation table usable for str.translate().
13105
13106If there is only one argument, it must be a dictionary mapping Unicode
13107ordinals (integers) or characters to Unicode ordinals, strings or None.
13108Character keys will be then converted to ordinals.
13109If there are two arguments, they must be strings of equal length, and
13110in the resulting dictionary, each character in x will be mapped to the
13111character at the same position in y. If there is a third argument, it
13112must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013113[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013114
Larry Hastings31826802013-10-19 00:09:25 -070013115static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013116unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013117/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013118{
Georg Brandlceee0772007-11-27 23:48:05 +000013119 PyObject *new = NULL, *key, *value;
13120 Py_ssize_t i = 0;
13121 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013122
Georg Brandlceee0772007-11-27 23:48:05 +000013123 new = PyDict_New();
13124 if (!new)
13125 return NULL;
13126 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127 int x_kind, y_kind, z_kind;
13128 void *x_data, *y_data, *z_data;
13129
Georg Brandlceee0772007-11-27 23:48:05 +000013130 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013131 if (!PyUnicode_Check(x)) {
13132 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13133 "be a string if there is a second argument");
13134 goto err;
13135 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013137 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13138 "arguments must have equal length");
13139 goto err;
13140 }
13141 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142 x_kind = PyUnicode_KIND(x);
13143 y_kind = PyUnicode_KIND(y);
13144 x_data = PyUnicode_DATA(x);
13145 y_data = PyUnicode_DATA(y);
13146 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13147 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013148 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013149 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013150 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013151 if (!value) {
13152 Py_DECREF(key);
13153 goto err;
13154 }
Georg Brandlceee0772007-11-27 23:48:05 +000013155 res = PyDict_SetItem(new, key, value);
13156 Py_DECREF(key);
13157 Py_DECREF(value);
13158 if (res < 0)
13159 goto err;
13160 }
13161 /* create entries for deleting chars in z */
13162 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013163 z_kind = PyUnicode_KIND(z);
13164 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013165 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013166 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013167 if (!key)
13168 goto err;
13169 res = PyDict_SetItem(new, key, Py_None);
13170 Py_DECREF(key);
13171 if (res < 0)
13172 goto err;
13173 }
13174 }
13175 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013176 int kind;
13177 void *data;
13178
Georg Brandlceee0772007-11-27 23:48:05 +000013179 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013180 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013181 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13182 "to maketrans it must be a dict");
13183 goto err;
13184 }
13185 /* copy entries into the new dict, converting string keys to int keys */
13186 while (PyDict_Next(x, &i, &key, &value)) {
13187 if (PyUnicode_Check(key)) {
13188 /* convert string keys to integer keys */
13189 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013190 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013191 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13192 "table must be of length 1");
13193 goto err;
13194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195 kind = PyUnicode_KIND(key);
13196 data = PyUnicode_DATA(key);
13197 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013198 if (!newkey)
13199 goto err;
13200 res = PyDict_SetItem(new, newkey, value);
13201 Py_DECREF(newkey);
13202 if (res < 0)
13203 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013204 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013205 /* just keep integer keys */
13206 if (PyDict_SetItem(new, key, value) < 0)
13207 goto err;
13208 } else {
13209 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13210 "be strings or integers");
13211 goto err;
13212 }
13213 }
13214 }
13215 return new;
13216 err:
13217 Py_DECREF(new);
13218 return NULL;
13219}
13220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013221PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013222 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013224Return a copy of the string S in which each character has been mapped\n\
13225through the given translation table. The table must implement\n\
13226lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13227mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13228this operation raises LookupError, the character is left untouched.\n\
13229Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013230
13231static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013233{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235}
13236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013237PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013238 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013240Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241
13242static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013243unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013245 if (PyUnicode_READY(self) == -1)
13246 return NULL;
13247 if (PyUnicode_IS_ASCII(self))
13248 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013249 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250}
13251
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013252PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013253 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013254\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013255Pad a numeric string S with zeros on the left, to fill a field\n\
13256of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257
13258static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013259unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013261 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013262 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013263 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264 int kind;
13265 void *data;
13266 Py_UCS4 chr;
13267
Martin v. Löwis18e16552006-02-15 17:27:45 +000013268 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013269 return NULL;
13270
Benjamin Petersonbac79492012-01-14 13:34:47 -050013271 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013273
Victor Stinnerc4b49542011-12-11 22:44:26 +010013274 if (PyUnicode_GET_LENGTH(self) >= width)
13275 return unicode_result_unchanged(self);
13276
13277 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278
13279 u = pad(self, fill, 0, '0');
13280
Walter Dörwald068325e2002-04-15 13:36:47 +000013281 if (u == NULL)
13282 return NULL;
13283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013284 kind = PyUnicode_KIND(u);
13285 data = PyUnicode_DATA(u);
13286 chr = PyUnicode_READ(kind, data, fill);
13287
13288 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013290 PyUnicode_WRITE(kind, data, 0, chr);
13291 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292 }
13293
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013294 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013295 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297
#if 0
/* Compiled out. Wrapper that forwards `self` to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013306PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013307 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013308\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013309Return True if S starts with the specified prefix, False otherwise.\n\
13310With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013311With optional end, stop comparing S at that position.\n\
13312prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313
13314static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013315unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013316 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013317{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013318 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013319 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013320 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013321 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013322 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013323
Jesus Ceaac451502011-04-20 17:09:23 +020013324 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013325 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013326 if (PyTuple_Check(subobj)) {
13327 Py_ssize_t i;
13328 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013329 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013330 if (substring == NULL)
13331 return NULL;
13332 result = tailmatch(self, substring, start, end, -1);
13333 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013334 if (result == -1)
13335 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013336 if (result) {
13337 Py_RETURN_TRUE;
13338 }
13339 }
13340 /* nothing matched */
13341 Py_RETURN_FALSE;
13342 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013343 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013344 if (substring == NULL) {
13345 if (PyErr_ExceptionMatches(PyExc_TypeError))
13346 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13347 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013348 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013349 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013350 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013351 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013352 if (result == -1)
13353 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013354 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013355}
13356
13357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013358PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013359 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013361Return True if S ends with the specified suffix, False otherwise.\n\
13362With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013363With optional end, stop comparing S at that position.\n\
13364suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013365
13366static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013367unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013368 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013369{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013370 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013371 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013372 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013373 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013374 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375
Jesus Ceaac451502011-04-20 17:09:23 +020013376 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013377 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013378 if (PyTuple_Check(subobj)) {
13379 Py_ssize_t i;
13380 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013381 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013383 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013384 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013385 result = tailmatch(self, substring, start, end, +1);
13386 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013387 if (result == -1)
13388 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013389 if (result) {
13390 Py_RETURN_TRUE;
13391 }
13392 }
13393 Py_RETURN_FALSE;
13394 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013395 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013396 if (substring == NULL) {
13397 if (PyErr_ExceptionMatches(PyExc_TypeError))
13398 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13399 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013400 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013401 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013402 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013403 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013404 if (result == -1)
13405 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013406 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013407}
13408
Victor Stinner202fdca2012-05-07 12:47:02 +020013409Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013410_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013411{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013412 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13413 writer->data = PyUnicode_DATA(writer->buffer);
13414
13415 if (!writer->readonly) {
13416 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013417 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013418 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013419 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013420 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13421 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13422 writer->kind = PyUnicode_WCHAR_KIND;
13423 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13424
Victor Stinner8f674cc2013-04-17 23:02:17 +020013425 /* Copy-on-write mode: set buffer size to 0 so
13426 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13427 * next write. */
13428 writer->size = 0;
13429 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013430}
13431
Victor Stinnerd3f08822012-05-29 12:57:52 +020013432void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013433_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013434{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013435 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013436
13437 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013438 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013439
13440 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13441 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13442 writer->kind = PyUnicode_WCHAR_KIND;
13443 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013444}
13445
Victor Stinnerd3f08822012-05-29 12:57:52 +020013446int
13447_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13448 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013449{
13450 Py_ssize_t newlen;
13451 PyObject *newbuffer;
13452
Victor Stinnerca9381e2015-09-22 00:58:32 +020013453 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013454 assert((maxchar > writer->maxchar && length >= 0)
13455 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013456
Victor Stinner202fdca2012-05-07 12:47:02 +020013457 if (length > PY_SSIZE_T_MAX - writer->pos) {
13458 PyErr_NoMemory();
13459 return -1;
13460 }
13461 newlen = writer->pos + length;
13462
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013463 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013464
Victor Stinnerd3f08822012-05-29 12:57:52 +020013465 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013466 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013467 if (writer->overallocate
13468 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13469 /* overallocate to limit the number of realloc() */
13470 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013471 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013472 if (newlen < writer->min_length)
13473 newlen = writer->min_length;
13474
Victor Stinnerd3f08822012-05-29 12:57:52 +020013475 writer->buffer = PyUnicode_New(newlen, maxchar);
13476 if (writer->buffer == NULL)
13477 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013478 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013479 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013480 if (writer->overallocate
13481 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13482 /* overallocate to limit the number of realloc() */
13483 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013484 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013485 if (newlen < writer->min_length)
13486 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013487
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013488 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013489 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013490 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013491 newbuffer = PyUnicode_New(newlen, maxchar);
13492 if (newbuffer == NULL)
13493 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013494 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13495 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013496 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013497 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013498 }
13499 else {
13500 newbuffer = resize_compact(writer->buffer, newlen);
13501 if (newbuffer == NULL)
13502 return -1;
13503 }
13504 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013505 }
13506 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013507 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013508 newbuffer = PyUnicode_New(writer->size, maxchar);
13509 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013510 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013511 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13512 writer->buffer, 0, writer->pos);
13513 Py_DECREF(writer->buffer);
13514 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013515 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013516 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013517 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013518
13519#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013520}
13521
Victor Stinnerca9381e2015-09-22 00:58:32 +020013522int
13523_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13524 enum PyUnicode_Kind kind)
13525{
13526 Py_UCS4 maxchar;
13527
13528 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13529 assert(writer->kind < kind);
13530
13531 switch (kind)
13532 {
13533 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13534 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13535 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13536 default:
13537 assert(0 && "invalid kind");
13538 return -1;
13539 }
13540
13541 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13542}
13543
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013544Py_LOCAL_INLINE(int)
13545_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013546{
13547 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13548 return -1;
13549 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13550 writer->pos++;
13551 return 0;
13552}
13553
13554int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013555_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13556{
13557 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13558}
13559
13560int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013561_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13562{
13563 Py_UCS4 maxchar;
13564 Py_ssize_t len;
13565
13566 if (PyUnicode_READY(str) == -1)
13567 return -1;
13568 len = PyUnicode_GET_LENGTH(str);
13569 if (len == 0)
13570 return 0;
13571 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13572 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013573 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013574 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013575 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013576 Py_INCREF(str);
13577 writer->buffer = str;
13578 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013579 writer->pos += len;
13580 return 0;
13581 }
13582 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13583 return -1;
13584 }
13585 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13586 str, 0, len);
13587 writer->pos += len;
13588 return 0;
13589}
13590
Victor Stinnere215d962012-10-06 23:03:36 +020013591int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013592_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13593 Py_ssize_t start, Py_ssize_t end)
13594{
13595 Py_UCS4 maxchar;
13596 Py_ssize_t len;
13597
13598 if (PyUnicode_READY(str) == -1)
13599 return -1;
13600
13601 assert(0 <= start);
13602 assert(end <= PyUnicode_GET_LENGTH(str));
13603 assert(start <= end);
13604
13605 if (end == 0)
13606 return 0;
13607
13608 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13609 return _PyUnicodeWriter_WriteStr(writer, str);
13610
13611 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13612 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13613 else
13614 maxchar = writer->maxchar;
13615 len = end - start;
13616
13617 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13618 return -1;
13619
13620 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13621 str, start, len);
13622 writer->pos += len;
13623 return 0;
13624}
13625
13626int
Victor Stinner4a587072013-11-19 12:54:53 +010013627_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13628 const char *ascii, Py_ssize_t len)
13629{
13630 if (len == -1)
13631 len = strlen(ascii);
13632
13633 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13634
13635 if (writer->buffer == NULL && !writer->overallocate) {
13636 PyObject *str;
13637
13638 str = _PyUnicode_FromASCII(ascii, len);
13639 if (str == NULL)
13640 return -1;
13641
13642 writer->readonly = 1;
13643 writer->buffer = str;
13644 _PyUnicodeWriter_Update(writer);
13645 writer->pos += len;
13646 return 0;
13647 }
13648
13649 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13650 return -1;
13651
13652 switch (writer->kind)
13653 {
13654 case PyUnicode_1BYTE_KIND:
13655 {
13656 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13657 Py_UCS1 *data = writer->data;
13658
13659 Py_MEMCPY(data + writer->pos, str, len);
13660 break;
13661 }
13662 case PyUnicode_2BYTE_KIND:
13663 {
13664 _PyUnicode_CONVERT_BYTES(
13665 Py_UCS1, Py_UCS2,
13666 ascii, ascii + len,
13667 (Py_UCS2 *)writer->data + writer->pos);
13668 break;
13669 }
13670 case PyUnicode_4BYTE_KIND:
13671 {
13672 _PyUnicode_CONVERT_BYTES(
13673 Py_UCS1, Py_UCS4,
13674 ascii, ascii + len,
13675 (Py_UCS4 *)writer->data + writer->pos);
13676 break;
13677 }
13678 default:
13679 assert(0);
13680 }
13681
13682 writer->pos += len;
13683 return 0;
13684}
13685
13686int
13687_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13688 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013689{
13690 Py_UCS4 maxchar;
13691
13692 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13693 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13694 return -1;
13695 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13696 writer->pos += len;
13697 return 0;
13698}
13699
Victor Stinnerd3f08822012-05-29 12:57:52 +020013700PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013701_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013702{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013703 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013704 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013705 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013706 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013707 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013708 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013709 str = writer->buffer;
13710 writer->buffer = NULL;
13711 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13712 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013713 }
13714 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13715 PyObject *newbuffer;
13716 newbuffer = resize_compact(writer->buffer, writer->pos);
13717 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013718 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013719 return NULL;
13720 }
13721 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013722 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013723 str = writer->buffer;
13724 writer->buffer = NULL;
13725 assert(_PyUnicode_CheckConsistency(str, 1));
13726 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013727}
13728
Victor Stinnerd3f08822012-05-29 12:57:52 +020013729void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013730_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013731{
13732 Py_CLEAR(writer->buffer);
13733}
13734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013735#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013736
/* Docstring attached to the "format" entry of unicode_methods. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring attached to the "format_map" entry of unicode_methods. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013748
Eric Smith4a7d76d2008-05-30 18:10:19 +000013749static PyObject *
13750unicode__format__(PyObject* self, PyObject* args)
13751{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013752 PyObject *format_spec;
13753 _PyUnicodeWriter writer;
13754 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013755
13756 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13757 return NULL;
13758
Victor Stinnerd3f08822012-05-29 12:57:52 +020013759 if (PyUnicode_READY(self) == -1)
13760 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013761 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013762 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13763 self, format_spec, 0,
13764 PyUnicode_GET_LENGTH(format_spec));
13765 if (ret == -1) {
13766 _PyUnicodeWriter_Dealloc(&writer);
13767 return NULL;
13768 }
13769 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013770}
13771
/* Docstring attached to the "__format__" entry of unicode_methods. */
PyDoc_STRVAR(p_format__doc__,
             "S.__format__(format_spec) -> str\n\
\n\
Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013776
13777static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013778unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013779{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013780 Py_ssize_t size;
13781
13782 /* If it's a compact object, account for base structure +
13783 character data. */
13784 if (PyUnicode_IS_COMPACT_ASCII(v))
13785 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13786 else if (PyUnicode_IS_COMPACT(v))
13787 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013788 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013789 else {
13790 /* If it is a two-block object, account for base object, and
13791 for character block if present. */
13792 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013793 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013794 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013795 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013796 }
13797 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013798 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013799 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013800 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013801 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013802 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013803
13804 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013805}
13806
/* Docstring attached to the "__sizeof__" entry of unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013809
13810static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013811unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013812{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013813 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013814 if (!copy)
13815 return NULL;
13816 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013817}
13818
/* Method table of the str type.

   Each entry gives the Python-level method name, the C function
   implementing it, the calling-convention flags (METH_*) and the
   docstring.  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs,
     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines,
     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* NOTE(review): macro defined outside this chunk; presumably expands
       to the complete table entry for maketrans -- confirm. */
    UNICODE_MAKETRANS_METHODDEF
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is given for this entry. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}   /* sentinel */
};
13876
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013877static PyObject *
13878unicode_mod(PyObject *v, PyObject *w)
13879{
Brian Curtindfc80e32011-08-10 20:28:54 -050013880 if (!PyUnicode_Check(v))
13881 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013882 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013883}
13884
/* Number protocol slots of the str type.  Only nb_remainder (the "%"
   operator) is filled in; the slots after it are not listed and are
   therefore zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13891
/* Sequence protocol slots of the str type: length, concatenation,
   repetition, item access and containment.  The slice and item-assignment
   slots are left empty (0). */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13902
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013903static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013904unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013905{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013906 if (PyUnicode_READY(self) == -1)
13907 return NULL;
13908
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013909 if (PyIndex_Check(item)) {
13910 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013911 if (i == -1 && PyErr_Occurred())
13912 return NULL;
13913 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013914 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013915 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013916 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013917 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013918 PyObject *result;
13919 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013920 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013921 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013923 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013924 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013925 return NULL;
13926 }
13927
13928 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013929 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013930 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013931 slicelength == PyUnicode_GET_LENGTH(self)) {
13932 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013933 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013934 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013935 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013936 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013937 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013938 src_kind = PyUnicode_KIND(self);
13939 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013940 if (!PyUnicode_IS_ASCII(self)) {
13941 kind_limit = kind_maxchar_limit(src_kind);
13942 max_char = 0;
13943 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13944 ch = PyUnicode_READ(src_kind, src_data, cur);
13945 if (ch > max_char) {
13946 max_char = ch;
13947 if (max_char >= kind_limit)
13948 break;
13949 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013950 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013951 }
Victor Stinner55c99112011-10-13 01:17:06 +020013952 else
13953 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013954 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013955 if (result == NULL)
13956 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013957 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013958 dest_data = PyUnicode_DATA(result);
13959
13960 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013961 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13962 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013963 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013964 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013965 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013966 } else {
13967 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13968 return NULL;
13969 }
13970}
13971
/* Mapping protocol slots of the str type: length and subscript
   (indexing/slicing via unicode_subscript).  Subscript assignment is not
   provided. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13977
Guido van Rossumd57fd912000-03-10 22:53:23 +000013978
Guido van Rossumd57fd912000-03-10 22:53:23 +000013979/* Helpers for PyUnicode_Format() */
13980
/* State shared by the PyUnicode_Format() helpers while one format
   operation is in progress. */
struct unicode_formatter_t {
    /* Argument source.  unicode_format_getnextarg() returns args itself
       when arglen is negative, and item number argidx of the tuple args
       otherwise. */
    PyObject *args;
    /* NOTE(review): presumably non-zero when a reference to args is owned
       here -- not visible in this chunk, confirm. */
    int args_owned;
    /* arglen: number of available arguments (negative: see args).
       argidx: index of the next argument to hand out. */
    Py_ssize_t arglen, argidx;
    /* NOTE(review): presumably the mapping used for "%(name)" lookups --
       confirm. */
    PyObject *dict;

    /* Format string access: FORMAT_READ() reads the character at position
       fmtpos from fmtdata using kind fmtkind. */
    enum PyUnicode_Kind fmtkind;
    /* NOTE(review): fmtcnt is presumably the number of format characters
       left to process -- confirm. */
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    /* NOTE(review): presumably the format string object fmtdata belongs
       to -- confirm. */
    PyObject *fmtstr;

    /* Writer the helpers append formatted output to. */
    _PyUnicodeWriter writer;
};
13994
/* Parsed description of one "%" conversion specifier. */
struct unicode_format_arg_t {
    /* Conversion character (e.g. 'd', 'x'); '(' is checked for by
       unicode_format_arg_parse(). */
    Py_UCS4 ch;
    /* Bitmask of F_* flags (F_ALT, F_SIGN, F_BLANK are tested in this
       file). */
    int flags;
    /* Field width; mainformatlong() compares it against -1 for its fast
       path (presumably meaning "not given" -- confirm). */
    Py_ssize_t width;
    /* Precision; a negative value makes formatfloat() use 6. */
    int prec;
    /* NOTE(review): not used in this chunk; presumably tells the caller
       whether sign handling applies to the formatted value -- confirm. */
    int sign;
};
14002
Guido van Rossumd57fd912000-03-10 22:53:23 +000014003static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014004unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014005{
Victor Stinnera47082312012-10-04 02:19:54 +020014006 Py_ssize_t argidx = ctx->argidx;
14007
14008 if (argidx < ctx->arglen) {
14009 ctx->argidx++;
14010 if (ctx->arglen < 0)
14011 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014012 else
Victor Stinnera47082312012-10-04 02:19:54 +020014013 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014014 }
14015 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014016 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014017 return NULL;
14018}
14019
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014020/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014021
Victor Stinnera47082312012-10-04 02:19:54 +020014022/* Format a float into the writer if the writer is not NULL, or into *p_output
14023 otherwise.
14024
14025 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014026static int
Victor Stinnera47082312012-10-04 02:19:54 +020014027formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14028 PyObject **p_output,
14029 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014030{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014031 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014032 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014033 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014034 int prec;
14035 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014036
Guido van Rossumd57fd912000-03-10 22:53:23 +000014037 x = PyFloat_AsDouble(v);
14038 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014039 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014040
Victor Stinnera47082312012-10-04 02:19:54 +020014041 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014042 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014043 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014044
Victor Stinnera47082312012-10-04 02:19:54 +020014045 if (arg->flags & F_ALT)
14046 dtoa_flags = Py_DTSF_ALT;
14047 else
14048 dtoa_flags = 0;
14049 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014050 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014051 return -1;
14052 len = strlen(p);
14053 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014054 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014055 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014056 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014057 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014058 }
14059 else
14060 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014061 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014062 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014063}
14064
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate-form flag, for Python ints.
 * Return value:  a new string object, or NULL (with an exception set) on
 * error.
 * The output string is of the form
 *     "-"? ("0x" | "0X" | "0o")? digit+
 * The base marker is present only for o, x and X conversions with alt
 * non-zero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 *   val      object to be converted; must be an int (asserted)
 *   alt      non-zero to keep the base marker
 *   prec     minimum number of digits; 0-fill on left if needed
 *   type     a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign for negative
 * values.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Avoid overflowing the int computation numnondigits + prec below:
       numnondigits is at most 3 (sign plus two-character base marker). */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        assert(!"'type' not in [diuoxX]");
        /* without assertions, fall through and format as decimal */
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* two non-digit characters: the base marker */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        /* two non-digit characters: the base marker */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* The bookkeeping below is done with int. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt is set.  This is done by moving
       buf forward by two characters and rewriting the sign, if any, at
       the new start. */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet the minimum number of digits.
       The padded text is built in a temporary bytes object, which then
       replaces result until the final conversion below. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        /* sign and base marker first, ... */
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        /* ... then the zero padding, ... */
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        /* ... then the digits. */
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case,
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* Build a new string object when result is the temporary bytes object
       or when buf no longer points at the start of result's data (the
       base marker was skipped). */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14210
Ethan Furmandf3ed242014-01-05 06:50:30 -080014211/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014212 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014213 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014214 * -1 and raise an exception on error */
14215static int
Victor Stinnera47082312012-10-04 02:19:54 +020014216mainformatlong(PyObject *v,
14217 struct unicode_format_arg_t *arg,
14218 PyObject **p_output,
14219 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014220{
14221 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014222 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014223
14224 if (!PyNumber_Check(v))
14225 goto wrongtype;
14226
Ethan Furman9ab74802014-03-21 06:38:46 -070014227 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014228 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014229 if (type == 'o' || type == 'x' || type == 'X') {
14230 iobj = PyNumber_Index(v);
14231 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014232 if (PyErr_ExceptionMatches(PyExc_TypeError))
14233 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014234 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014235 }
14236 }
14237 else {
14238 iobj = PyNumber_Long(v);
14239 if (iobj == NULL ) {
14240 if (PyErr_ExceptionMatches(PyExc_TypeError))
14241 goto wrongtype;
14242 return -1;
14243 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014244 }
14245 assert(PyLong_Check(iobj));
14246 }
14247 else {
14248 iobj = v;
14249 Py_INCREF(iobj);
14250 }
14251
14252 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014253 && arg->width == -1 && arg->prec == -1
14254 && !(arg->flags & (F_SIGN | F_BLANK))
14255 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014256 {
14257 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014258 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014259 int base;
14260
Victor Stinnera47082312012-10-04 02:19:54 +020014261 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014262 {
14263 default:
14264 assert(0 && "'type' not in [diuoxX]");
14265 case 'd':
14266 case 'i':
14267 case 'u':
14268 base = 10;
14269 break;
14270 case 'o':
14271 base = 8;
14272 break;
14273 case 'x':
14274 case 'X':
14275 base = 16;
14276 break;
14277 }
14278
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014279 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14280 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014281 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014282 }
14283 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014284 return 1;
14285 }
14286
Ethan Furmanb95b5612015-01-23 20:05:18 -080014287 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014288 Py_DECREF(iobj);
14289 if (res == NULL)
14290 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014291 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014292 return 0;
14293
14294wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014295 switch(type)
14296 {
14297 case 'o':
14298 case 'x':
14299 case 'X':
14300 PyErr_Format(PyExc_TypeError,
14301 "%%%c format: an integer is required, "
14302 "not %.200s",
14303 type, Py_TYPE(v)->tp_name);
14304 break;
14305 default:
14306 PyErr_Format(PyExc_TypeError,
14307 "%%%c format: a number is required, "
14308 "not %.200s",
14309 type, Py_TYPE(v)->tp_name);
14310 break;
14311 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014312 return -1;
14313}
14314
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014315static Py_UCS4
14316formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014317{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014318 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014319 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014320 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014321 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014322 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014323 goto onError;
14324 }
14325 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014326 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014327 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014328 /* make sure number is a type of integer */
14329 if (!PyLong_Check(v)) {
14330 iobj = PyNumber_Index(v);
14331 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014332 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014333 }
14334 v = iobj;
14335 Py_DECREF(iobj);
14336 }
14337 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014338 x = PyLong_AsLong(v);
14339 if (x == -1 && PyErr_Occurred())
14340 goto onError;
14341
Victor Stinner8faf8212011-12-08 22:14:11 +010014342 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014343 PyErr_SetString(PyExc_OverflowError,
14344 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014345 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014346 }
14347
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014348 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014349 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014350
Benjamin Peterson29060642009-01-31 22:14:21 +000014351 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014352 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014353 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014354 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014355}
14356
Victor Stinnera47082312012-10-04 02:19:54 +020014357/* Parse options of an argument: flags, width, precision.
14358 Handle also "%(name)" syntax.
14359
14360 Return 0 if the argument has been formatted into arg->str.
14361 Return 1 if the argument has been written into ctx->writer,
14362 Raise an exception and return -1 on error. */
14363static int
14364unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14365 struct unicode_format_arg_t *arg)
14366{
14367#define FORMAT_READ(ctx) \
14368 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14369
14370 PyObject *v;
14371
Victor Stinnera47082312012-10-04 02:19:54 +020014372 if (arg->ch == '(') {
14373 /* Get argument value from a dictionary. Example: "%(name)s". */
14374 Py_ssize_t keystart;
14375 Py_ssize_t keylen;
14376 PyObject *key;
14377 int pcount = 1;
14378
14379 if (ctx->dict == NULL) {
14380 PyErr_SetString(PyExc_TypeError,
14381 "format requires a mapping");
14382 return -1;
14383 }
14384 ++ctx->fmtpos;
14385 --ctx->fmtcnt;
14386 keystart = ctx->fmtpos;
14387 /* Skip over balanced parentheses */
14388 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14389 arg->ch = FORMAT_READ(ctx);
14390 if (arg->ch == ')')
14391 --pcount;
14392 else if (arg->ch == '(')
14393 ++pcount;
14394 ctx->fmtpos++;
14395 }
14396 keylen = ctx->fmtpos - keystart - 1;
14397 if (ctx->fmtcnt < 0 || pcount > 0) {
14398 PyErr_SetString(PyExc_ValueError,
14399 "incomplete format key");
14400 return -1;
14401 }
14402 key = PyUnicode_Substring(ctx->fmtstr,
14403 keystart, keystart + keylen);
14404 if (key == NULL)
14405 return -1;
14406 if (ctx->args_owned) {
14407 Py_DECREF(ctx->args);
14408 ctx->args_owned = 0;
14409 }
14410 ctx->args = PyObject_GetItem(ctx->dict, key);
14411 Py_DECREF(key);
14412 if (ctx->args == NULL)
14413 return -1;
14414 ctx->args_owned = 1;
14415 ctx->arglen = -1;
14416 ctx->argidx = -2;
14417 }
14418
14419 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014420 while (--ctx->fmtcnt >= 0) {
14421 arg->ch = FORMAT_READ(ctx);
14422 ctx->fmtpos++;
14423 switch (arg->ch) {
14424 case '-': arg->flags |= F_LJUST; continue;
14425 case '+': arg->flags |= F_SIGN; continue;
14426 case ' ': arg->flags |= F_BLANK; continue;
14427 case '#': arg->flags |= F_ALT; continue;
14428 case '0': arg->flags |= F_ZERO; continue;
14429 }
14430 break;
14431 }
14432
14433 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014434 if (arg->ch == '*') {
14435 v = unicode_format_getnextarg(ctx);
14436 if (v == NULL)
14437 return -1;
14438 if (!PyLong_Check(v)) {
14439 PyErr_SetString(PyExc_TypeError,
14440 "* wants int");
14441 return -1;
14442 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014443 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014444 if (arg->width == -1 && PyErr_Occurred())
14445 return -1;
14446 if (arg->width < 0) {
14447 arg->flags |= F_LJUST;
14448 arg->width = -arg->width;
14449 }
14450 if (--ctx->fmtcnt >= 0) {
14451 arg->ch = FORMAT_READ(ctx);
14452 ctx->fmtpos++;
14453 }
14454 }
14455 else if (arg->ch >= '0' && arg->ch <= '9') {
14456 arg->width = arg->ch - '0';
14457 while (--ctx->fmtcnt >= 0) {
14458 arg->ch = FORMAT_READ(ctx);
14459 ctx->fmtpos++;
14460 if (arg->ch < '0' || arg->ch > '9')
14461 break;
14462 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14463 mixing signed and unsigned comparison. Since arg->ch is between
14464 '0' and '9', casting to int is safe. */
14465 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14466 PyErr_SetString(PyExc_ValueError,
14467 "width too big");
14468 return -1;
14469 }
14470 arg->width = arg->width*10 + (arg->ch - '0');
14471 }
14472 }
14473
14474 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014475 if (arg->ch == '.') {
14476 arg->prec = 0;
14477 if (--ctx->fmtcnt >= 0) {
14478 arg->ch = FORMAT_READ(ctx);
14479 ctx->fmtpos++;
14480 }
14481 if (arg->ch == '*') {
14482 v = unicode_format_getnextarg(ctx);
14483 if (v == NULL)
14484 return -1;
14485 if (!PyLong_Check(v)) {
14486 PyErr_SetString(PyExc_TypeError,
14487 "* wants int");
14488 return -1;
14489 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014490 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014491 if (arg->prec == -1 && PyErr_Occurred())
14492 return -1;
14493 if (arg->prec < 0)
14494 arg->prec = 0;
14495 if (--ctx->fmtcnt >= 0) {
14496 arg->ch = FORMAT_READ(ctx);
14497 ctx->fmtpos++;
14498 }
14499 }
14500 else if (arg->ch >= '0' && arg->ch <= '9') {
14501 arg->prec = arg->ch - '0';
14502 while (--ctx->fmtcnt >= 0) {
14503 arg->ch = FORMAT_READ(ctx);
14504 ctx->fmtpos++;
14505 if (arg->ch < '0' || arg->ch > '9')
14506 break;
14507 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14508 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014509 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014510 return -1;
14511 }
14512 arg->prec = arg->prec*10 + (arg->ch - '0');
14513 }
14514 }
14515 }
14516
14517 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14518 if (ctx->fmtcnt >= 0) {
14519 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14520 if (--ctx->fmtcnt >= 0) {
14521 arg->ch = FORMAT_READ(ctx);
14522 ctx->fmtpos++;
14523 }
14524 }
14525 }
14526 if (ctx->fmtcnt < 0) {
14527 PyErr_SetString(PyExc_ValueError,
14528 "incomplete format");
14529 return -1;
14530 }
14531 return 0;
14532
14533#undef FORMAT_READ
14534}
14535
14536/* Format one argument. Supported conversion specifiers:
14537
14538 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014539 - "i", "d", "u": int or float
14540 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014541 - "e", "E", "f", "F", "g", "G": float
14542 - "c": int or str (1 character)
14543
Victor Stinner8dbd4212012-12-04 09:30:24 +010014544 When possible, the output is written directly into the Unicode writer
14545 (ctx->writer). A string is created when padding is required.
14546
Victor Stinnera47082312012-10-04 02:19:54 +020014547 Return 0 if the argument has been formatted into *p_str,
14548 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014549 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014550static int
14551unicode_format_arg_format(struct unicode_formatter_t *ctx,
14552 struct unicode_format_arg_t *arg,
14553 PyObject **p_str)
14554{
14555 PyObject *v;
14556 _PyUnicodeWriter *writer = &ctx->writer;
14557
14558 if (ctx->fmtcnt == 0)
14559 ctx->writer.overallocate = 0;
14560
14561 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014562 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014563 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014564 return 1;
14565 }
14566
14567 v = unicode_format_getnextarg(ctx);
14568 if (v == NULL)
14569 return -1;
14570
Victor Stinnera47082312012-10-04 02:19:54 +020014571
14572 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014573 case 's':
14574 case 'r':
14575 case 'a':
14576 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14577 /* Fast path */
14578 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14579 return -1;
14580 return 1;
14581 }
14582
14583 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14584 *p_str = v;
14585 Py_INCREF(*p_str);
14586 }
14587 else {
14588 if (arg->ch == 's')
14589 *p_str = PyObject_Str(v);
14590 else if (arg->ch == 'r')
14591 *p_str = PyObject_Repr(v);
14592 else
14593 *p_str = PyObject_ASCII(v);
14594 }
14595 break;
14596
14597 case 'i':
14598 case 'd':
14599 case 'u':
14600 case 'o':
14601 case 'x':
14602 case 'X':
14603 {
14604 int ret = mainformatlong(v, arg, p_str, writer);
14605 if (ret != 0)
14606 return ret;
14607 arg->sign = 1;
14608 break;
14609 }
14610
14611 case 'e':
14612 case 'E':
14613 case 'f':
14614 case 'F':
14615 case 'g':
14616 case 'G':
14617 if (arg->width == -1 && arg->prec == -1
14618 && !(arg->flags & (F_SIGN | F_BLANK)))
14619 {
14620 /* Fast path */
14621 if (formatfloat(v, arg, NULL, writer) == -1)
14622 return -1;
14623 return 1;
14624 }
14625
14626 arg->sign = 1;
14627 if (formatfloat(v, arg, p_str, NULL) == -1)
14628 return -1;
14629 break;
14630
14631 case 'c':
14632 {
14633 Py_UCS4 ch = formatchar(v);
14634 if (ch == (Py_UCS4) -1)
14635 return -1;
14636 if (arg->width == -1 && arg->prec == -1) {
14637 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014638 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014639 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014640 return 1;
14641 }
14642 *p_str = PyUnicode_FromOrdinal(ch);
14643 break;
14644 }
14645
14646 default:
14647 PyErr_Format(PyExc_ValueError,
14648 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014649 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014650 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14651 (int)arg->ch,
14652 ctx->fmtpos - 1);
14653 return -1;
14654 }
14655 if (*p_str == NULL)
14656 return -1;
14657 assert (PyUnicode_Check(*p_str));
14658 return 0;
14659}
14660
14661static int
14662unicode_format_arg_output(struct unicode_formatter_t *ctx,
14663 struct unicode_format_arg_t *arg,
14664 PyObject *str)
14665{
14666 Py_ssize_t len;
14667 enum PyUnicode_Kind kind;
14668 void *pbuf;
14669 Py_ssize_t pindex;
14670 Py_UCS4 signchar;
14671 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014672 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014673 Py_ssize_t sublen;
14674 _PyUnicodeWriter *writer = &ctx->writer;
14675 Py_UCS4 fill;
14676
14677 fill = ' ';
14678 if (arg->sign && arg->flags & F_ZERO)
14679 fill = '0';
14680
14681 if (PyUnicode_READY(str) == -1)
14682 return -1;
14683
14684 len = PyUnicode_GET_LENGTH(str);
14685 if ((arg->width == -1 || arg->width <= len)
14686 && (arg->prec == -1 || arg->prec >= len)
14687 && !(arg->flags & (F_SIGN | F_BLANK)))
14688 {
14689 /* Fast path */
14690 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14691 return -1;
14692 return 0;
14693 }
14694
14695 /* Truncate the string for "s", "r" and "a" formats
14696 if the precision is set */
14697 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14698 if (arg->prec >= 0 && len > arg->prec)
14699 len = arg->prec;
14700 }
14701
14702 /* Adjust sign and width */
14703 kind = PyUnicode_KIND(str);
14704 pbuf = PyUnicode_DATA(str);
14705 pindex = 0;
14706 signchar = '\0';
14707 if (arg->sign) {
14708 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14709 if (ch == '-' || ch == '+') {
14710 signchar = ch;
14711 len--;
14712 pindex++;
14713 }
14714 else if (arg->flags & F_SIGN)
14715 signchar = '+';
14716 else if (arg->flags & F_BLANK)
14717 signchar = ' ';
14718 else
14719 arg->sign = 0;
14720 }
14721 if (arg->width < len)
14722 arg->width = len;
14723
14724 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014725 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014726 if (!(arg->flags & F_LJUST)) {
14727 if (arg->sign) {
14728 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014729 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014730 }
14731 else {
14732 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014733 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014734 }
14735 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014736 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14737 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014738 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014739 }
14740
Victor Stinnera47082312012-10-04 02:19:54 +020014741 buflen = arg->width;
14742 if (arg->sign && len == arg->width)
14743 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014744 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014745 return -1;
14746
14747 /* Write the sign if needed */
14748 if (arg->sign) {
14749 if (fill != ' ') {
14750 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14751 writer->pos += 1;
14752 }
14753 if (arg->width > len)
14754 arg->width--;
14755 }
14756
14757 /* Write the numeric prefix for "x", "X" and "o" formats
14758 if the alternate form is used.
14759 For example, write "0x" for the "%#x" format. */
14760 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14761 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14762 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14763 if (fill != ' ') {
14764 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14765 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14766 writer->pos += 2;
14767 pindex += 2;
14768 }
14769 arg->width -= 2;
14770 if (arg->width < 0)
14771 arg->width = 0;
14772 len -= 2;
14773 }
14774
14775 /* Pad left with the fill character if needed */
14776 if (arg->width > len && !(arg->flags & F_LJUST)) {
14777 sublen = arg->width - len;
14778 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14779 writer->pos += sublen;
14780 arg->width = len;
14781 }
14782
14783 /* If padding with spaces: write sign if needed and/or numeric prefix if
14784 the alternate form is used */
14785 if (fill == ' ') {
14786 if (arg->sign) {
14787 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14788 writer->pos += 1;
14789 }
14790 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14791 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14792 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14793 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14794 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14795 writer->pos += 2;
14796 pindex += 2;
14797 }
14798 }
14799
14800 /* Write characters */
14801 if (len) {
14802 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14803 str, pindex, len);
14804 writer->pos += len;
14805 }
14806
14807 /* Pad right with the fill character if needed */
14808 if (arg->width > len) {
14809 sublen = arg->width - len;
14810 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14811 writer->pos += sublen;
14812 }
14813 return 0;
14814}
14815
14816/* Helper of PyUnicode_Format(): format one arg.
14817 Return 0 on success, raise an exception and return -1 on error. */
14818static int
14819unicode_format_arg(struct unicode_formatter_t *ctx)
14820{
14821 struct unicode_format_arg_t arg;
14822 PyObject *str;
14823 int ret;
14824
Victor Stinner8dbd4212012-12-04 09:30:24 +010014825 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14826 arg.flags = 0;
14827 arg.width = -1;
14828 arg.prec = -1;
14829 arg.sign = 0;
14830 str = NULL;
14831
Victor Stinnera47082312012-10-04 02:19:54 +020014832 ret = unicode_format_arg_parse(ctx, &arg);
14833 if (ret == -1)
14834 return -1;
14835
14836 ret = unicode_format_arg_format(ctx, &arg, &str);
14837 if (ret == -1)
14838 return -1;
14839
14840 if (ret != 1) {
14841 ret = unicode_format_arg_output(ctx, &arg, str);
14842 Py_DECREF(str);
14843 if (ret == -1)
14844 return -1;
14845 }
14846
14847 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14848 PyErr_SetString(PyExc_TypeError,
14849 "not all arguments converted during string formatting");
14850 return -1;
14851 }
14852 return 0;
14853}
14854
Alexander Belopolsky40018472011-02-26 01:02:56 +000014855PyObject *
14856PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014857{
Victor Stinnera47082312012-10-04 02:19:54 +020014858 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014859
Guido van Rossumd57fd912000-03-10 22:53:23 +000014860 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014861 PyErr_BadInternalCall();
14862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014863 }
Victor Stinnera47082312012-10-04 02:19:54 +020014864
14865 ctx.fmtstr = PyUnicode_FromObject(format);
14866 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014867 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014868 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14869 Py_DECREF(ctx.fmtstr);
14870 return NULL;
14871 }
14872 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14873 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14874 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14875 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014876
Victor Stinner8f674cc2013-04-17 23:02:17 +020014877 _PyUnicodeWriter_Init(&ctx.writer);
14878 ctx.writer.min_length = ctx.fmtcnt + 100;
14879 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014880
Guido van Rossumd57fd912000-03-10 22:53:23 +000014881 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014882 ctx.arglen = PyTuple_Size(args);
14883 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014884 }
14885 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014886 ctx.arglen = -1;
14887 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014888 }
Victor Stinnera47082312012-10-04 02:19:54 +020014889 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014890 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014891 ctx.dict = args;
14892 else
14893 ctx.dict = NULL;
14894 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014895
Victor Stinnera47082312012-10-04 02:19:54 +020014896 while (--ctx.fmtcnt >= 0) {
14897 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014898 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014899
14900 nonfmtpos = ctx.fmtpos++;
14901 while (ctx.fmtcnt >= 0 &&
14902 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14903 ctx.fmtpos++;
14904 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014905 }
Victor Stinnera47082312012-10-04 02:19:54 +020014906 if (ctx.fmtcnt < 0) {
14907 ctx.fmtpos--;
14908 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014909 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014910
Victor Stinnercfc4c132013-04-03 01:48:39 +020014911 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14912 nonfmtpos, ctx.fmtpos) < 0)
14913 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014914 }
14915 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014916 ctx.fmtpos++;
14917 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014918 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014919 }
14920 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014921
Victor Stinnera47082312012-10-04 02:19:54 +020014922 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014923 PyErr_SetString(PyExc_TypeError,
14924 "not all arguments converted during string formatting");
14925 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014926 }
14927
Victor Stinnera47082312012-10-04 02:19:54 +020014928 if (ctx.args_owned) {
14929 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014930 }
Victor Stinnera47082312012-10-04 02:19:54 +020014931 Py_DECREF(ctx.fmtstr);
14932 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014933
Benjamin Peterson29060642009-01-31 22:14:21 +000014934 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014935 Py_DECREF(ctx.fmtstr);
14936 _PyUnicodeWriter_Dealloc(&ctx.writer);
14937 if (ctx.args_owned) {
14938 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014939 }
14940 return NULL;
14941}
14942
Jeremy Hylton938ace62002-07-17 16:30:39 +000014943static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014944unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14945
Tim Peters6d6c1a32001-08-02 04:15:00 +000014946static PyObject *
14947unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14948{
Benjamin Peterson29060642009-01-31 22:14:21 +000014949 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014950 static char *kwlist[] = {"object", "encoding", "errors", 0};
14951 char *encoding = NULL;
14952 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014953
Benjamin Peterson14339b62009-01-31 16:36:08 +000014954 if (type != &PyUnicode_Type)
14955 return unicode_subtype_new(type, args, kwds);
14956 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014957 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014958 return NULL;
14959 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014960 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014961 if (encoding == NULL && errors == NULL)
14962 return PyObject_Str(x);
14963 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014964 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014965}
14966
Guido van Rossume023fe02001-08-30 03:12:59 +000014967static PyObject *
14968unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14969{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014970 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014971 Py_ssize_t length, char_size;
14972 int share_wstr, share_utf8;
14973 unsigned int kind;
14974 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014975
Benjamin Peterson14339b62009-01-31 16:36:08 +000014976 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014977
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014978 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014979 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014980 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014981 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014982 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014983 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014984 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014985 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014986
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014987 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014988 if (self == NULL) {
14989 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014990 return NULL;
14991 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014992 kind = PyUnicode_KIND(unicode);
14993 length = PyUnicode_GET_LENGTH(unicode);
14994
14995 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014996#ifdef Py_DEBUG
14997 _PyUnicode_HASH(self) = -1;
14998#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014999 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015000#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015001 _PyUnicode_STATE(self).interned = 0;
15002 _PyUnicode_STATE(self).kind = kind;
15003 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015004 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015005 _PyUnicode_STATE(self).ready = 1;
15006 _PyUnicode_WSTR(self) = NULL;
15007 _PyUnicode_UTF8_LENGTH(self) = 0;
15008 _PyUnicode_UTF8(self) = NULL;
15009 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015010 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015011
15012 share_utf8 = 0;
15013 share_wstr = 0;
15014 if (kind == PyUnicode_1BYTE_KIND) {
15015 char_size = 1;
15016 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15017 share_utf8 = 1;
15018 }
15019 else if (kind == PyUnicode_2BYTE_KIND) {
15020 char_size = 2;
15021 if (sizeof(wchar_t) == 2)
15022 share_wstr = 1;
15023 }
15024 else {
15025 assert(kind == PyUnicode_4BYTE_KIND);
15026 char_size = 4;
15027 if (sizeof(wchar_t) == 4)
15028 share_wstr = 1;
15029 }
15030
15031 /* Ensure we won't overflow the length. */
15032 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15033 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015034 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015035 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015036 data = PyObject_MALLOC((length + 1) * char_size);
15037 if (data == NULL) {
15038 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015039 goto onError;
15040 }
15041
Victor Stinnerc3c74152011-10-02 20:39:55 +020015042 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015043 if (share_utf8) {
15044 _PyUnicode_UTF8_LENGTH(self) = length;
15045 _PyUnicode_UTF8(self) = data;
15046 }
15047 if (share_wstr) {
15048 _PyUnicode_WSTR_LENGTH(self) = length;
15049 _PyUnicode_WSTR(self) = (wchar_t *)data;
15050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015051
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015052 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015053 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015054 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015055#ifdef Py_DEBUG
15056 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15057#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015058 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015059 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015060
15061onError:
15062 Py_DECREF(unicode);
15063 Py_DECREF(self);
15064 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015065}
15066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015067PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015068"str(object='') -> str\n\
15069str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015070\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015071Create a new string object from the given object. If encoding or\n\
15072errors is specified, then the object must expose a data buffer\n\
15073that will be decoded using the given encoding and error handler.\n\
15074Otherwise, returns the result of object.__str__() (if defined)\n\
15075or repr(object).\n\
15076encoding defaults to sys.getdefaultencoding().\n\
15077errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015078
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015079static PyObject *unicode_iter(PyObject *seq);
15080
Guido van Rossumd57fd912000-03-10 22:53:23 +000015081PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015082 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015083 "str", /* tp_name */
15084 sizeof(PyUnicodeObject), /* tp_size */
15085 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015086 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015087 (destructor)unicode_dealloc, /* tp_dealloc */
15088 0, /* tp_print */
15089 0, /* tp_getattr */
15090 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015091 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015092 unicode_repr, /* tp_repr */
15093 &unicode_as_number, /* tp_as_number */
15094 &unicode_as_sequence, /* tp_as_sequence */
15095 &unicode_as_mapping, /* tp_as_mapping */
15096 (hashfunc) unicode_hash, /* tp_hash*/
15097 0, /* tp_call*/
15098 (reprfunc) unicode_str, /* tp_str */
15099 PyObject_GenericGetAttr, /* tp_getattro */
15100 0, /* tp_setattro */
15101 0, /* tp_as_buffer */
15102 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015103 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015104 unicode_doc, /* tp_doc */
15105 0, /* tp_traverse */
15106 0, /* tp_clear */
15107 PyUnicode_RichCompare, /* tp_richcompare */
15108 0, /* tp_weaklistoffset */
15109 unicode_iter, /* tp_iter */
15110 0, /* tp_iternext */
15111 unicode_methods, /* tp_methods */
15112 0, /* tp_members */
15113 0, /* tp_getset */
15114 &PyBaseObject_Type, /* tp_base */
15115 0, /* tp_dict */
15116 0, /* tp_descr_get */
15117 0, /* tp_descr_set */
15118 0, /* tp_dictoffset */
15119 0, /* tp_init */
15120 0, /* tp_alloc */
15121 unicode_new, /* tp_new */
15122 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015123};
15124
15125/* Initialize the Unicode implementation */
15126
Victor Stinner3a50e702011-10-18 21:21:00 +020015127int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015128{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015129 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015130 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015131 0x000A, /* LINE FEED */
15132 0x000D, /* CARRIAGE RETURN */
15133 0x001C, /* FILE SEPARATOR */
15134 0x001D, /* GROUP SEPARATOR */
15135 0x001E, /* RECORD SEPARATOR */
15136 0x0085, /* NEXT LINE */
15137 0x2028, /* LINE SEPARATOR */
15138 0x2029, /* PARAGRAPH SEPARATOR */
15139 };
15140
Fred Drakee4315f52000-05-09 19:53:39 +000015141 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015142 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015143 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015144 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015145 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015146
Guido van Rossumcacfc072002-05-24 19:01:59 +000015147 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015148 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015149
15150 /* initialize the linebreak bloom filter */
15151 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015152 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015153 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015154
Christian Heimes26532f72013-07-20 14:57:16 +020015155 if (PyType_Ready(&EncodingMapType) < 0)
15156 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015157
Benjamin Petersonc4311282012-10-30 23:21:10 -040015158 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15159 Py_FatalError("Can't initialize field name iterator type");
15160
15161 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15162 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015163
Victor Stinner3a50e702011-10-18 21:21:00 +020015164 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015165}
15166
15167/* Finalize the Unicode implementation */
15168
/* Currently does nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15174
Guido van Rossumd57fd912000-03-10 22:53:23 +000015175void
Thomas Wouters78890102000-07-22 19:25:51 +000015176_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015177{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015178 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015179
Serhiy Storchaka05997252013-01-26 12:14:02 +020015180 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015181
Serhiy Storchaka05997252013-01-26 12:14:02 +020015182 for (i = 0; i < 256; i++)
15183 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015184 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015185 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015186}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015187
Walter Dörwald16807132007-05-25 13:52:07 +000015188void
15189PyUnicode_InternInPlace(PyObject **p)
15190{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015191 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015192 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015193#ifdef Py_DEBUG
15194 assert(s != NULL);
15195 assert(_PyUnicode_CHECK(s));
15196#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015197 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015198 return;
15199#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015200 /* If it's a subclass, we don't really know what putting
15201 it in the interned dict might do. */
15202 if (!PyUnicode_CheckExact(s))
15203 return;
15204 if (PyUnicode_CHECK_INTERNED(s))
15205 return;
15206 if (interned == NULL) {
15207 interned = PyDict_New();
15208 if (interned == NULL) {
15209 PyErr_Clear(); /* Don't leave an exception */
15210 return;
15211 }
15212 }
15213 /* It might be that the GetItem call fails even
15214 though the key is present in the dictionary,
15215 namely when this happens during a stack overflow. */
15216 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015217 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015218 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015219
Victor Stinnerf0335102013-04-14 19:13:03 +020015220 if (t) {
15221 Py_INCREF(t);
15222 Py_DECREF(*p);
15223 *p = t;
15224 return;
15225 }
Walter Dörwald16807132007-05-25 13:52:07 +000015226
Benjamin Peterson14339b62009-01-31 16:36:08 +000015227 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015228 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015229 PyErr_Clear();
15230 PyThreadState_GET()->recursion_critical = 0;
15231 return;
15232 }
15233 PyThreadState_GET()->recursion_critical = 0;
15234 /* The two references in interned are not counted by refcnt.
15235 The deallocator will take care of this */
15236 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015237 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015238}
15239
15240void
15241PyUnicode_InternImmortal(PyObject **p)
15242{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015243 PyUnicode_InternInPlace(p);
15244 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015245 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015246 Py_INCREF(*p);
15247 }
Walter Dörwald16807132007-05-25 13:52:07 +000015248}
15249
15250PyObject *
15251PyUnicode_InternFromString(const char *cp)
15252{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015253 PyObject *s = PyUnicode_FromString(cp);
15254 if (s == NULL)
15255 return NULL;
15256 PyUnicode_InternInPlace(&s);
15257 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015258}
15259
Alexander Belopolsky40018472011-02-26 01:02:56 +000015260void
15261_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015262{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015263 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015264 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015265 Py_ssize_t i, n;
15266 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015267
Benjamin Peterson14339b62009-01-31 16:36:08 +000015268 if (interned == NULL || !PyDict_Check(interned))
15269 return;
15270 keys = PyDict_Keys(interned);
15271 if (keys == NULL || !PyList_Check(keys)) {
15272 PyErr_Clear();
15273 return;
15274 }
Walter Dörwald16807132007-05-25 13:52:07 +000015275
Benjamin Peterson14339b62009-01-31 16:36:08 +000015276 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15277 detector, interned unicode strings are not forcibly deallocated;
15278 rather, we give them their stolen references back, and then clear
15279 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015280
Benjamin Peterson14339b62009-01-31 16:36:08 +000015281 n = PyList_GET_SIZE(keys);
15282 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015283 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015284 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015285 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015286 if (PyUnicode_READY(s) == -1) {
15287 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015288 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015289 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015290 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015291 case SSTATE_NOT_INTERNED:
15292 /* XXX Shouldn't happen */
15293 break;
15294 case SSTATE_INTERNED_IMMORTAL:
15295 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015296 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015297 break;
15298 case SSTATE_INTERNED_MORTAL:
15299 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015300 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015301 break;
15302 default:
15303 Py_FatalError("Inconsistent interned string state.");
15304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015305 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015306 }
15307 fprintf(stderr, "total size of all interned strings: "
15308 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15309 "mortal/immortal\n", mortal_size, immortal_size);
15310 Py_DECREF(keys);
15311 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015312 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015313}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015314
15315
15316/********************* Unicode Iterator **************************/
15317
15318typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015319 PyObject_HEAD
15320 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015321 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015322} unicodeiterobject;
15323
15324static void
15325unicodeiter_dealloc(unicodeiterobject *it)
15326{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015327 _PyObject_GC_UNTRACK(it);
15328 Py_XDECREF(it->it_seq);
15329 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015330}
15331
15332static int
15333unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15334{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015335 Py_VISIT(it->it_seq);
15336 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015337}
15338
15339static PyObject *
15340unicodeiter_next(unicodeiterobject *it)
15341{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015342 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015343
Benjamin Peterson14339b62009-01-31 16:36:08 +000015344 assert(it != NULL);
15345 seq = it->it_seq;
15346 if (seq == NULL)
15347 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015348 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015350 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15351 int kind = PyUnicode_KIND(seq);
15352 void *data = PyUnicode_DATA(seq);
15353 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15354 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015355 if (item != NULL)
15356 ++it->it_index;
15357 return item;
15358 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015359
Benjamin Peterson14339b62009-01-31 16:36:08 +000015360 Py_DECREF(seq);
15361 it->it_seq = NULL;
15362 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015363}
15364
15365static PyObject *
15366unicodeiter_len(unicodeiterobject *it)
15367{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015368 Py_ssize_t len = 0;
15369 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015370 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015371 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015372}
15373
15374PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15375
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015376static PyObject *
15377unicodeiter_reduce(unicodeiterobject *it)
15378{
15379 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015380 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015381 it->it_seq, it->it_index);
15382 } else {
15383 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15384 if (u == NULL)
15385 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015386 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015387 }
15388}
15389
15390PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15391
15392static PyObject *
15393unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15394{
15395 Py_ssize_t index = PyLong_AsSsize_t(state);
15396 if (index == -1 && PyErr_Occurred())
15397 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015398 if (it->it_seq != NULL) {
15399 if (index < 0)
15400 index = 0;
15401 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15402 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15403 it->it_index = index;
15404 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015405 Py_RETURN_NONE;
15406}
15407
15408PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15409
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015410static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015411 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015412 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015413 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15414 reduce_doc},
15415 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15416 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015417 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015418};
15419
15420PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015421 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15422 "str_iterator", /* tp_name */
15423 sizeof(unicodeiterobject), /* tp_basicsize */
15424 0, /* tp_itemsize */
15425 /* methods */
15426 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15427 0, /* tp_print */
15428 0, /* tp_getattr */
15429 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015430 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015431 0, /* tp_repr */
15432 0, /* tp_as_number */
15433 0, /* tp_as_sequence */
15434 0, /* tp_as_mapping */
15435 0, /* tp_hash */
15436 0, /* tp_call */
15437 0, /* tp_str */
15438 PyObject_GenericGetAttr, /* tp_getattro */
15439 0, /* tp_setattro */
15440 0, /* tp_as_buffer */
15441 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15442 0, /* tp_doc */
15443 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15444 0, /* tp_clear */
15445 0, /* tp_richcompare */
15446 0, /* tp_weaklistoffset */
15447 PyObject_SelfIter, /* tp_iter */
15448 (iternextfunc)unicodeiter_next, /* tp_iternext */
15449 unicodeiter_methods, /* tp_methods */
15450 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015451};
15452
15453static PyObject *
15454unicode_iter(PyObject *seq)
15455{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015456 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015457
Benjamin Peterson14339b62009-01-31 16:36:08 +000015458 if (!PyUnicode_Check(seq)) {
15459 PyErr_BadInternalCall();
15460 return NULL;
15461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015462 if (PyUnicode_READY(seq) == -1)
15463 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015464 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15465 if (it == NULL)
15466 return NULL;
15467 it->it_index = 0;
15468 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015469 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015470 _PyObject_GC_TRACK(it);
15471 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015472}
15473
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015474
15475size_t
15476Py_UNICODE_strlen(const Py_UNICODE *u)
15477{
15478 int res = 0;
15479 while(*u++)
15480 res++;
15481 return res;
15482}
15483
15484Py_UNICODE*
15485Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15486{
15487 Py_UNICODE *u = s1;
15488 while ((*u++ = *s2++));
15489 return s1;
15490}
15491
15492Py_UNICODE*
15493Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15494{
15495 Py_UNICODE *u = s1;
15496 while ((*u++ = *s2++))
15497 if (n-- == 0)
15498 break;
15499 return s1;
15500}
15501
15502Py_UNICODE*
15503Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15504{
15505 Py_UNICODE *u1 = s1;
15506 u1 += Py_UNICODE_strlen(u1);
15507 Py_UNICODE_strcpy(u1, s2);
15508 return s1;
15509}
15510
15511int
15512Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15513{
15514 while (*s1 && *s2 && *s1 == *s2)
15515 s1++, s2++;
15516 if (*s1 && *s2)
15517 return (*s1 < *s2) ? -1 : +1;
15518 if (*s1)
15519 return 1;
15520 if (*s2)
15521 return -1;
15522 return 0;
15523}
15524
15525int
15526Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15527{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015528 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015529 for (; n != 0; n--) {
15530 u1 = *s1;
15531 u2 = *s2;
15532 if (u1 != u2)
15533 return (u1 < u2) ? -1 : +1;
15534 if (u1 == '\0')
15535 return 0;
15536 s1++;
15537 s2++;
15538 }
15539 return 0;
15540}
15541
15542Py_UNICODE*
15543Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15544{
15545 const Py_UNICODE *p;
15546 for (p = s; *p; p++)
15547 if (*p == c)
15548 return (Py_UNICODE*)p;
15549 return NULL;
15550}
15551
15552Py_UNICODE*
15553Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15554{
15555 const Py_UNICODE *p;
15556 p = s + Py_UNICODE_strlen(s);
15557 while (p != s) {
15558 p--;
15559 if (*p == c)
15560 return (Py_UNICODE*)p;
15561 }
15562 return NULL;
15563}
Victor Stinner331ea922010-08-10 16:37:20 +000015564
Victor Stinner71133ff2010-09-01 23:43:53 +000015565Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015566PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015567{
Victor Stinner577db2c2011-10-11 22:12:48 +020015568 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015569 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015571 if (!PyUnicode_Check(unicode)) {
15572 PyErr_BadArgument();
15573 return NULL;
15574 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015575 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015576 if (u == NULL)
15577 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015578 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015579 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015580 PyErr_NoMemory();
15581 return NULL;
15582 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015583 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015584 size *= sizeof(Py_UNICODE);
15585 copy = PyMem_Malloc(size);
15586 if (copy == NULL) {
15587 PyErr_NoMemory();
15588 return NULL;
15589 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015590 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015591 return copy;
15592}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015593
Georg Brandl66c221e2010-10-14 07:04:07 +000015594/* A _string module, to export formatter_parser and formatter_field_name_split
15595 to the string.Formatter class implemented in Python. */
15596
15597static PyMethodDef _string_methods[] = {
15598 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15599 METH_O, PyDoc_STR("split the argument as a field name")},
15600 {"formatter_parser", (PyCFunction) formatter_parser,
15601 METH_O, PyDoc_STR("parse the argument as a format string")},
15602 {NULL, NULL}
15603};
15604
15605static struct PyModuleDef _string_module = {
15606 PyModuleDef_HEAD_INIT,
15607 "_string",
15608 PyDoc_STR("string helper module"),
15609 0,
15610 _string_methods,
15611 NULL,
15612 NULL,
15613 NULL,
15614 NULL
15615};
15616
15617PyMODINIT_FUNC
15618PyInit__string(void)
15619{
15620 return PyModule_Create(&_string_module);
15621}
15622
15623
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015624#ifdef __cplusplus
15625}
15626#endif