blob: e9e703f278960040575913447b7d89506ec4b060 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070045#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000047#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000048#include <windows.h>
49#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000050
Larry Hastings61272b72014-01-07 12:41:53 -080051/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080052class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080053[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080054/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080055
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000056/* --- Globals ------------------------------------------------------------
57
Serhiy Storchaka05997252013-01-26 12:14:02 +020058NOTE: In the interpreter's initialization phase, some globals are currently
59 initialized dynamically as needed. In the process Unicode objects may
60 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000061
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest valid code point as of Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Type check used by the accessor macros below.  Regular builds only test
   the object's type; Py_DEBUG builds run the structural consistency checker
   (with content checking disabled). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Unchecked field accessors: plain casts to the struct that owns the field,
   with no assertion on the argument.  All of them are usable as lvalues. */
#define _PyUnicode_UTF8(op)         (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* Checked accessors: assert that the argument is a Unicode object before
   reading the field. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 view of a ready string.  For a compact ASCII string the character
   data stored right after the PyASCIIObject header is used directly and the
   string length doubles as the UTF-8 length; otherwise the utf8 and
   utf8_length fields are read. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement for PyUnicode_READY: evaluates to 0 when the string
   is already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
          _PyUnicode_Ready(op) :                        \
          0))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the UTF-8 pointer of `op` is the very same pointer as its
   character data, i.e. no separate UTF-8 buffer exists.  Must not be used
   on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the very same pointer as its
   character data, i.e. no separate wchar_t buffer exists.  Every part of
   the expansion refers to the macro argument, so the macro works whatever
   the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* Non-zero when the object owns a separately allocated UTF-8 buffer: it is
   not compact ASCII, its utf8 pointer is set, and that pointer is not the
   character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op) &&                 \
     _PyUnicode_UTF8(op) != NULL &&                     \
     _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero when the object owns a separately allocated wstr buffer: the
   wstr pointer is set and either the string is not ready yet or the pointer
   is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL &&                     \
     (!PyUnicode_IS_READY(op) ||                        \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
141
/* Generic element-wise copy between character buffers of different widths.

   from_type, to_type: valid type names for the source and destination
                       elements.
   begin, end:         pointers delimiting the source characters
                       ("from_type *"); `end` is one past the last one.
   to:                 destination buffer ("to_type *"); it must have room
                       for (end - begin) elements.

   Each element is converted with a plain cast.  The bulk of the input is
   handled four elements per iteration; the remainder is copied one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (from_type *)(begin);                  \
        const from_type *_end = (from_type *)(end);                     \
        Py_ssize_t _count = _end - _iter;                               \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                     \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {           \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < _end; ++_iter, ++_to) {                          \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200165
/* Divisor that determines how much extra room is requested when a buffer
   is overallocated. */
#ifndef MS_WINDOWS
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200182static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200185static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200186
/* Take a new reference to the shared empty string, creating it on first
   use.  If PyUnicode_New() fails, unicode_empty is left NULL. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function; the returned pointer is NULL if the empty string could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                                      \
    do {                                                                \
        _Py_INCREF_UNICODE_EMPTY();                                     \
        return unicode_empty;                                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200206/* Forward declaration */
207Py_LOCAL_INLINE(int)
208_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
209
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200210/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213/* Single character Unicode strings in the Latin-1 range are being
214 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216
/* Fast detection of the most frequent whitespace characters: entry N is 1
   when code point N (0..127) is whitespace, and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
247
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200248/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200249static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200250static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100251static int unicode_modifiable(PyObject *unicode);
252
Victor Stinnerfe226c02011-10-03 03:52:20 +0200253
Alexander Belopolsky40018472011-02-26 01:02:56 +0000254static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100255_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200256static PyObject *
257_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
258static PyObject *
259_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
260
261static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000263 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100264 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000265 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
266
Alexander Belopolsky40018472011-02-26 01:02:56 +0000267static void
268raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300269 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100270 PyObject *unicode,
271 Py_ssize_t startpos, Py_ssize_t endpos,
272 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000273
/* Fast detection of line break characters: entry N is 1 when code point N
   (0..127) is a line break, and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
301
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300302#include "clinic/unicodeobject.c.h"
303
/* Identifiers for the error handler names recognized by get_error_handler().
   _Py_ERROR_OTHER stands for any name that is not in the list. */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler identifier.

   errors: handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL or "strict", the matching identifier
   for the other known names, and _Py_ERROR_OTHER for anything else.  The
   comparison is an exact, case-sensitive strcmp(). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict", _Py_ERROR_STRICT},
        {"surrogateescape", _Py_ERROR_SURROGATEESCAPE},
        {"replace", _Py_ERROR_REPLACE},
        {"ignore", _Py_ERROR_IGNORE},
        {"backslashreplace", _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass", _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
342
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300343/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
344 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000345Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000346PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000347{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000348#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000349 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000350#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000351 /* This is actually an illegal character, so it should
352 not be passed to unichr. */
353 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000354#endif
355}
356
#ifdef Py_DEBUG
/* Debug-build checker for the internal layout of a Unicode object.

   op:            object to inspect; must be a Unicode object (asserted).
   check_content: when non-zero (and the string is not a legacy wchar_t
                  string), also scan the characters to verify that the
                  narrowest possible kind is used and that the buffer is
                  null-terminated.

   Every violation trips an assert().  The function always returns 1 so
   that the call itself can be placed inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    /* Kind whose character width equals that of wchar_t on this platform. */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *buffer;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters follow the compact header */
            buffer = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != buffer);
        }
        else {
            PyUnicodeObject *legacy = (PyUnicodeObject *)op;

            buffer = legacy->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that has not been made ready */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(buffer == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(buffer != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == buffer);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != buffer);
                }
            }
        }

        /* wstr may only alias the character buffer when the kind has the
           same width as wchar_t */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == buffer);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != buffer);
            }
        }

        /* an absent cache must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(ascii);
        Py_UCS4 highest = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (ch > highest) {
                highest = ch;
            }
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
        }
        else {
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
        }
        /* the buffer must be null-terminated */
        assert(PyUnicode_READ(kind, chars, ascii->length) == 0);
    }
    return 1;
}
#endif
472
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100473static PyObject*
474unicode_result_wchar(PyObject *unicode)
475{
476#ifndef Py_DEBUG
477 Py_ssize_t len;
478
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100479 len = _PyUnicode_WSTR_LENGTH(unicode);
480 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100481 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200482 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100483 }
484
485 if (len == 1) {
486 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100487 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100488 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
489 Py_DECREF(unicode);
490 return latin1_char;
491 }
492 }
493
494 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200495 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 return NULL;
497 }
498#else
Victor Stinneraa771272012-10-04 02:32:58 +0200499 assert(Py_REFCNT(unicode) == 1);
500
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100501 /* don't make the result ready in debug mode to ensure that the caller
502 makes the string ready before using it */
503 assert(_PyUnicode_CheckConsistency(unicode, 1));
504#endif
505 return unicode;
506}
507
508static PyObject*
509unicode_result_ready(PyObject *unicode)
510{
511 Py_ssize_t length;
512
513 length = PyUnicode_GET_LENGTH(unicode);
514 if (length == 0) {
515 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100516 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200517 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 }
519 return unicode_empty;
520 }
521
522 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200523 void *data = PyUnicode_DATA(unicode);
524 int kind = PyUnicode_KIND(unicode);
525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100526 if (ch < 256) {
527 PyObject *latin1_char = unicode_latin1[ch];
528 if (latin1_char != NULL) {
529 if (unicode != latin1_char) {
530 Py_INCREF(latin1_char);
531 Py_DECREF(unicode);
532 }
533 return latin1_char;
534 }
535 else {
536 assert(_PyUnicode_CheckConsistency(unicode, 1));
537 Py_INCREF(unicode);
538 unicode_latin1[ch] = unicode;
539 return unicode;
540 }
541 }
542 }
543
544 assert(_PyUnicode_CheckConsistency(unicode, 1));
545 return unicode;
546}
547
548static PyObject*
549unicode_result(PyObject *unicode)
550{
551 assert(_PyUnicode_CHECK(unicode));
552 if (PyUnicode_IS_READY(unicode))
553 return unicode_result_ready(unicode);
554 else
555 return unicode_result_wchar(unicode);
556}
557
Victor Stinnerc4b49542011-12-11 22:44:26 +0100558static PyObject*
559unicode_result_unchanged(PyObject *unicode)
560{
561 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500562 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100563 return NULL;
564 Py_INCREF(unicode);
565 return unicode;
566 }
567 else
568 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100569 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100570}
571
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200572/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
573 ASCII, Latin1, UTF-8, etc. */
574static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200575backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200576 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
577{
Victor Stinnerad771582015-10-09 12:38:53 +0200578 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200579 Py_UCS4 ch;
580 enum PyUnicode_Kind kind;
581 void *data;
582
583 assert(PyUnicode_IS_READY(unicode));
584 kind = PyUnicode_KIND(unicode);
585 data = PyUnicode_DATA(unicode);
586
587 size = 0;
588 /* determine replacement size */
589 for (i = collstart; i < collend; ++i) {
590 Py_ssize_t incr;
591
592 ch = PyUnicode_READ(kind, data, i);
593 if (ch < 0x100)
594 incr = 2+2;
595 else if (ch < 0x10000)
596 incr = 2+4;
597 else {
598 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200599 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200600 }
601 if (size > PY_SSIZE_T_MAX - incr) {
602 PyErr_SetString(PyExc_OverflowError,
603 "encoded result is too long for a Python string");
604 return NULL;
605 }
606 size += incr;
607 }
608
Victor Stinnerad771582015-10-09 12:38:53 +0200609 str = _PyBytesWriter_Prepare(writer, str, size);
610 if (str == NULL)
611 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200612
613 /* generate replacement */
614 for (i = collstart; i < collend; ++i) {
615 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200616 *str++ = '\\';
617 if (ch >= 0x00010000) {
618 *str++ = 'U';
619 *str++ = Py_hexdigits[(ch>>28)&0xf];
620 *str++ = Py_hexdigits[(ch>>24)&0xf];
621 *str++ = Py_hexdigits[(ch>>20)&0xf];
622 *str++ = Py_hexdigits[(ch>>16)&0xf];
623 *str++ = Py_hexdigits[(ch>>12)&0xf];
624 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200625 }
Victor Stinner797485e2015-10-09 03:17:30 +0200626 else if (ch >= 0x100) {
627 *str++ = 'u';
628 *str++ = Py_hexdigits[(ch>>12)&0xf];
629 *str++ = Py_hexdigits[(ch>>8)&0xf];
630 }
631 else
632 *str++ = 'x';
633 *str++ = Py_hexdigits[(ch>>4)&0xf];
634 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200635 }
636 return str;
637}
638
639/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
640 ASCII, Latin1, UTF-8, etc. */
641static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200642xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200643 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
644{
Victor Stinnerad771582015-10-09 12:38:53 +0200645 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200646 Py_UCS4 ch;
647 enum PyUnicode_Kind kind;
648 void *data;
649
650 assert(PyUnicode_IS_READY(unicode));
651 kind = PyUnicode_KIND(unicode);
652 data = PyUnicode_DATA(unicode);
653
654 size = 0;
655 /* determine replacement size */
656 for (i = collstart; i < collend; ++i) {
657 Py_ssize_t incr;
658
659 ch = PyUnicode_READ(kind, data, i);
660 if (ch < 10)
661 incr = 2+1+1;
662 else if (ch < 100)
663 incr = 2+2+1;
664 else if (ch < 1000)
665 incr = 2+3+1;
666 else if (ch < 10000)
667 incr = 2+4+1;
668 else if (ch < 100000)
669 incr = 2+5+1;
670 else if (ch < 1000000)
671 incr = 2+6+1;
672 else {
673 assert(ch <= MAX_UNICODE);
674 incr = 2+7+1;
675 }
676 if (size > PY_SSIZE_T_MAX - incr) {
677 PyErr_SetString(PyExc_OverflowError,
678 "encoded result is too long for a Python string");
679 return NULL;
680 }
681 size += incr;
682 }
683
Victor Stinnerad771582015-10-09 12:38:53 +0200684 str = _PyBytesWriter_Prepare(writer, str, size);
685 if (str == NULL)
686 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200687
688 /* generate replacement */
689 for (i = collstart; i < collend; ++i) {
690 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
691 }
692 return str;
693}
694
Thomas Wouters477c8d52006-05-27 19:21:47 +0000695/* --- Bloom Filters ----------------------------------------------------- */
696
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
700
701/* the linebreak mask is set up by Unicode_Init below */
702
Antoine Pitrouf068f942010-01-13 14:19:12 +0000703#if LONG_BIT >= 128
704#define BLOOM_WIDTH 128
705#elif LONG_BIT >= 64
706#define BLOOM_WIDTH 64
707#elif LONG_BIT >= 32
708#define BLOOM_WIDTH 32
709#else
710#error "LONG_BIT is smaller than 32"
711#endif
712
Thomas Wouters477c8d52006-05-27 19:21:47 +0000713#define BLOOM_MASK unsigned long
714
Serhiy Storchaka05997252013-01-26 12:14:02 +0200715static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000716
Antoine Pitrouf068f942010-01-13 14:19:12 +0000717#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000718
Benjamin Peterson29060642009-01-31 22:14:21 +0000719#define BLOOM_LINEBREAK(ch) \
720 ((ch) < 128U ? ascii_linebreak[(ch)] : \
721 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000722
Alexander Belopolsky40018472011-02-26 01:02:56 +0000723Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000725{
Victor Stinnera85af502013-04-09 21:53:54 +0200726#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
727 do { \
728 TYPE *data = (TYPE *)PTR; \
729 TYPE *end = data + LEN; \
730 Py_UCS4 ch; \
731 for (; data != end; data++) { \
732 ch = *data; \
733 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
734 } \
735 break; \
736 } while (0)
737
Thomas Wouters477c8d52006-05-27 19:21:47 +0000738 /* calculate simple bloom-style bitmask for a given unicode string */
739
Antoine Pitrouf068f942010-01-13 14:19:12 +0000740 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000741
742 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200743 switch (kind) {
744 case PyUnicode_1BYTE_KIND:
745 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
746 break;
747 case PyUnicode_2BYTE_KIND:
748 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
749 break;
750 case PyUnicode_4BYTE_KIND:
751 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
752 break;
753 default:
754 assert(0);
755 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000756 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200757
758#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000759}
760
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300761static int
762ensure_unicode(PyObject *obj)
763{
764 if (!PyUnicode_Check(obj)) {
765 PyErr_Format(PyExc_TypeError,
766 "must be str, not %.100s",
767 Py_TYPE(obj)->tp_name);
768 return -1;
769 }
770 return PyUnicode_READY(obj);
771}
772
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200773/* Compilation of templated routines */
774
775#include "stringlib/asciilib.h"
776#include "stringlib/fastsearch.h"
777#include "stringlib/partition.h"
778#include "stringlib/split.h"
779#include "stringlib/count.h"
780#include "stringlib/find.h"
781#include "stringlib/find_max_char.h"
782#include "stringlib/localeutil.h"
783#include "stringlib/undef.h"
784
785#include "stringlib/ucs1lib.h"
786#include "stringlib/fastsearch.h"
787#include "stringlib/partition.h"
788#include "stringlib/split.h"
789#include "stringlib/count.h"
790#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300791#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200792#include "stringlib/find_max_char.h"
793#include "stringlib/localeutil.h"
794#include "stringlib/undef.h"
795
796#include "stringlib/ucs2lib.h"
797#include "stringlib/fastsearch.h"
798#include "stringlib/partition.h"
799#include "stringlib/split.h"
800#include "stringlib/count.h"
801#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300802#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200803#include "stringlib/find_max_char.h"
804#include "stringlib/localeutil.h"
805#include "stringlib/undef.h"
806
807#include "stringlib/ucs4lib.h"
808#include "stringlib/fastsearch.h"
809#include "stringlib/partition.h"
810#include "stringlib/split.h"
811#include "stringlib/count.h"
812#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300813#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200814#include "stringlib/find_max_char.h"
815#include "stringlib/localeutil.h"
816#include "stringlib/undef.h"
817
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200818#include "stringlib/unicodedefs.h"
819#include "stringlib/fastsearch.h"
820#include "stringlib/count.h"
821#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100822#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200823
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824/* --- Unicode Object ----------------------------------------------------- */
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200827fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200829Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200830 Py_ssize_t size, Py_UCS4 ch,
831 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200833 switch (kind) {
834 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200835 if ((Py_UCS1) ch != ch)
836 return -1;
837 if (direction > 0)
838 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
839 else
840 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200841 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200842 if ((Py_UCS2) ch != ch)
843 return -1;
844 if (direction > 0)
845 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
846 else
847 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200848 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200849 if (direction > 0)
850 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
851 else
852 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200853 default:
854 assert(0);
855 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200857}
858
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters [old_length, length) of a Unicode
   string with 0xff bytes so that code reading them before they are
   initialized is detected earlier.  Does nothing when the string did not
   grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind is used as the size in bytes of one character */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
877
Victor Stinnerfe226c02011-10-03 03:52:20 +0200878static PyObject*
879resize_compact(PyObject *unicode, Py_ssize_t length)
880{
881 Py_ssize_t char_size;
882 Py_ssize_t struct_size;
883 Py_ssize_t new_size;
884 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100885 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200886#ifdef Py_DEBUG
887 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
888#endif
889
Victor Stinner79891572012-05-03 13:43:07 +0200890 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200891 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100892 assert(PyUnicode_IS_COMPACT(unicode));
893
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200894 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100895 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200896 struct_size = sizeof(PyASCIIObject);
897 else
898 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200899 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200900
Victor Stinnerfe226c02011-10-03 03:52:20 +0200901 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
902 PyErr_NoMemory();
903 return NULL;
904 }
905 new_size = (struct_size + (length + 1) * char_size);
906
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200907 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
908 PyObject_DEL(_PyUnicode_UTF8(unicode));
909 _PyUnicode_UTF8(unicode) = NULL;
910 _PyUnicode_UTF8_LENGTH(unicode) = 0;
911 }
Victor Stinner84def372011-12-11 20:04:56 +0100912 _Py_DEC_REFTOTAL;
913 _Py_ForgetReference(unicode);
914
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300915 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100916 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100917 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 PyErr_NoMemory();
919 return NULL;
920 }
Victor Stinner84def372011-12-11 20:04:56 +0100921 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200922 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100923
Victor Stinnerfe226c02011-10-03 03:52:20 +0200924 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200925 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200926 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100927 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200928 _PyUnicode_WSTR_LENGTH(unicode) = length;
929 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100930 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
931 PyObject_DEL(_PyUnicode_WSTR(unicode));
932 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100933 if (!PyUnicode_IS_ASCII(unicode))
934 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100935 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200936#ifdef Py_DEBUG
937 unicode_fill_invalid(unicode, old_length);
938#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
940 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200941 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200942 return unicode;
943}
944
Alexander Belopolsky40018472011-02-26 01:02:56 +0000945static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200946resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947{
Victor Stinner95663112011-10-04 01:03:50 +0200948 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100949 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200950 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200951 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000952
Victor Stinnerfe226c02011-10-03 03:52:20 +0200953 if (PyUnicode_IS_READY(unicode)) {
954 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200955 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200957#ifdef Py_DEBUG
958 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
959#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200960
961 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200962 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200963 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
964 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200965
966 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
967 PyErr_NoMemory();
968 return -1;
969 }
970 new_size = (length + 1) * char_size;
971
Victor Stinner7a9105a2011-12-12 00:13:42 +0100972 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
973 {
974 PyObject_DEL(_PyUnicode_UTF8(unicode));
975 _PyUnicode_UTF8(unicode) = NULL;
976 _PyUnicode_UTF8_LENGTH(unicode) = 0;
977 }
978
Victor Stinnerfe226c02011-10-03 03:52:20 +0200979 data = (PyObject *)PyObject_REALLOC(data, new_size);
980 if (data == NULL) {
981 PyErr_NoMemory();
982 return -1;
983 }
984 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200985 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200986 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200987 _PyUnicode_WSTR_LENGTH(unicode) = length;
988 }
989 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200990 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200991 _PyUnicode_UTF8_LENGTH(unicode) = length;
992 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200993 _PyUnicode_LENGTH(unicode) = length;
994 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200995#ifdef Py_DEBUG
996 unicode_fill_invalid(unicode, old_length);
997#endif
Victor Stinner95663112011-10-04 01:03:50 +0200998 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200999 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001000 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001002 }
Victor Stinner95663112011-10-04 01:03:50 +02001003 assert(_PyUnicode_WSTR(unicode) != NULL);
1004
1005 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001006 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001007 PyErr_NoMemory();
1008 return -1;
1009 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001010 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001011 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001012 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001013 if (!wstr) {
1014 PyErr_NoMemory();
1015 return -1;
1016 }
1017 _PyUnicode_WSTR(unicode) = wstr;
1018 _PyUnicode_WSTR(unicode)[length] = 0;
1019 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001020 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001021 return 0;
1022}
1023
Victor Stinnerfe226c02011-10-03 03:52:20 +02001024static PyObject*
1025resize_copy(PyObject *unicode, Py_ssize_t length)
1026{
1027 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001028 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001030
Benjamin Petersonbac79492012-01-14 13:34:47 -05001031 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +01001032 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033
1034 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1035 if (copy == NULL)
1036 return NULL;
1037
1038 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001039 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001041 }
1042 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001043 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001044
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001045 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 if (w == NULL)
1047 return NULL;
1048 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1049 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001050 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1051 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001052 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 }
1054}
1055
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001057 Ux0000 terminated; some code (e.g. new_identifier)
1058 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059
1060 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001061 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001062
1063*/
1064
Alexander Belopolsky40018472011-02-26 01:02:56 +00001065static PyUnicodeObject *
1066_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001068 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001070
Thomas Wouters477c8d52006-05-27 19:21:47 +00001071 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 if (length == 0 && unicode_empty != NULL) {
1073 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001074 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075 }
1076
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001077 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001078 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001079 return (PyUnicodeObject *)PyErr_NoMemory();
1080 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001081 if (length < 0) {
1082 PyErr_SetString(PyExc_SystemError,
1083 "Negative size passed to _PyUnicode_New");
1084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085 }
1086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001087 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1088 if (unicode == NULL)
1089 return NULL;
1090 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001091
1092 _PyUnicode_WSTR_LENGTH(unicode) = length;
1093 _PyUnicode_HASH(unicode) = -1;
1094 _PyUnicode_STATE(unicode).interned = 0;
1095 _PyUnicode_STATE(unicode).kind = 0;
1096 _PyUnicode_STATE(unicode).compact = 0;
1097 _PyUnicode_STATE(unicode).ready = 0;
1098 _PyUnicode_STATE(unicode).ascii = 0;
1099 _PyUnicode_DATA_ANY(unicode) = NULL;
1100 _PyUnicode_LENGTH(unicode) = 0;
1101 _PyUnicode_UTF8(unicode) = NULL;
1102 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1105 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001106 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001107 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001108 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110
Jeremy Hyltond8082792003-09-16 19:41:39 +00001111 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001112 * the caller fails before initializing str -- unicode_resize()
1113 * reads str[0], and the Keep-Alive optimization can keep memory
1114 * allocated for str alive across a call to unicode_dealloc(unicode).
1115 * We don't want unicode_resize to read uninitialized memory in
1116 * that case.
1117 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 _PyUnicode_WSTR(unicode)[0] = 0;
1119 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001120
Victor Stinner7931d9a2011-11-04 00:22:48 +01001121 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001122 return unicode;
1123}
1124
Victor Stinnerf42dc442011-10-02 23:33:16 +02001125static const char*
1126unicode_kind_name(PyObject *unicode)
1127{
Victor Stinner42dfd712011-10-03 14:41:45 +02001128 /* don't check consistency: unicode_kind_name() is called from
1129 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001130 if (!PyUnicode_IS_COMPACT(unicode))
1131 {
1132 if (!PyUnicode_IS_READY(unicode))
1133 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001134 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001135 {
1136 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001137 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001138 return "legacy ascii";
1139 else
1140 return "legacy latin1";
1141 case PyUnicode_2BYTE_KIND:
1142 return "legacy UCS2";
1143 case PyUnicode_4BYTE_KIND:
1144 return "legacy UCS4";
1145 default:
1146 return "<legacy invalid kind>";
1147 }
1148 }
1149 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001152 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001153 return "ascii";
1154 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001155 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001156 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001157 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001158 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001159 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001160 default:
1161 return "<invalid compact kind>";
1162 }
1163}
1164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1170
/* Function form of the _PyUnicode_COMPACT_DATA() macro */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1174void *_PyUnicode_data(void *unicode){
1175 printf("obj %p\n", unicode);
1176 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1177 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1178 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1179 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1180 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1181 return PyUnicode_DATA(unicode);
1182}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001183
1184void
1185_PyUnicode_Dump(PyObject *op)
1186{
1187 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001188 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1189 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1190 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001191
Victor Stinnera849a4b2011-10-03 12:12:11 +02001192 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001193 {
1194 if (ascii->state.ascii)
1195 data = (ascii + 1);
1196 else
1197 data = (compact + 1);
1198 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001199 else
1200 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001201 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1202 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001203
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 if (ascii->wstr == data)
1205 printf("shared ");
1206 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera3b334d2011-10-03 13:53:37 +02001208 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001209 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001210 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1211 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001212 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1213 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001216}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217#endif
1218
1219PyObject *
1220PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1221{
1222 PyObject *obj;
1223 PyCompactUnicodeObject *unicode;
1224 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001225 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001226 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 Py_ssize_t char_size;
1228 Py_ssize_t struct_size;
1229
1230 /* Optimization for empty strings */
1231 if (size == 0 && unicode_empty != NULL) {
1232 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001233 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234 }
1235
Victor Stinner9e9d6892011-10-04 01:02:02 +02001236 is_ascii = 0;
1237 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 struct_size = sizeof(PyCompactUnicodeObject);
1239 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001240 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 char_size = 1;
1242 is_ascii = 1;
1243 struct_size = sizeof(PyASCIIObject);
1244 }
1245 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001246 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 char_size = 1;
1248 }
1249 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001250 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 char_size = 2;
1252 if (sizeof(wchar_t) == 2)
1253 is_sharing = 1;
1254 }
1255 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001256 if (maxchar > MAX_UNICODE) {
1257 PyErr_SetString(PyExc_SystemError,
1258 "invalid maximum character passed to PyUnicode_New");
1259 return NULL;
1260 }
Victor Stinner8f825062012-04-27 13:55:39 +02001261 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262 char_size = 4;
1263 if (sizeof(wchar_t) == 4)
1264 is_sharing = 1;
1265 }
1266
1267 /* Ensure we won't overflow the size. */
1268 if (size < 0) {
1269 PyErr_SetString(PyExc_SystemError,
1270 "Negative size passed to PyUnicode_New");
1271 return NULL;
1272 }
1273 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1274 return PyErr_NoMemory();
1275
1276 /* Duplicated allocation code from _PyObject_New() instead of a call to
1277 * PyObject_New() so we are able to allocate space for the object and
1278 * it's data buffer.
1279 */
1280 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1281 if (obj == NULL)
1282 return PyErr_NoMemory();
1283 obj = PyObject_INIT(obj, &PyUnicode_Type);
1284 if (obj == NULL)
1285 return NULL;
1286
1287 unicode = (PyCompactUnicodeObject *)obj;
1288 if (is_ascii)
1289 data = ((PyASCIIObject*)obj) + 1;
1290 else
1291 data = unicode + 1;
1292 _PyUnicode_LENGTH(unicode) = size;
1293 _PyUnicode_HASH(unicode) = -1;
1294 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001295 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 _PyUnicode_STATE(unicode).compact = 1;
1297 _PyUnicode_STATE(unicode).ready = 1;
1298 _PyUnicode_STATE(unicode).ascii = is_ascii;
1299 if (is_ascii) {
1300 ((char*)data)[size] = 0;
1301 _PyUnicode_WSTR(unicode) = NULL;
1302 }
Victor Stinner8f825062012-04-27 13:55:39 +02001303 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 ((char*)data)[size] = 0;
1305 _PyUnicode_WSTR(unicode) = NULL;
1306 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001308 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 else {
1311 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001312 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001313 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001315 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 ((Py_UCS4*)data)[size] = 0;
1317 if (is_sharing) {
1318 _PyUnicode_WSTR_LENGTH(unicode) = size;
1319 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1320 }
1321 else {
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1323 _PyUnicode_WSTR(unicode) = NULL;
1324 }
1325 }
Victor Stinner8f825062012-04-27 13:55:39 +02001326#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001327 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001328#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001329 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 return obj;
1331}
1332
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   A high surrogate immediately followed by a low surrogate is joined into
   a single code point; every other element, lone surrogates included, is
   copied unchanged.  The other conversions are implemented as macros for
   efficiency.

   The characters [begin, end) are written to the 4-byte data of `unicode`.
   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* src[1] is only read when it lies before end */
        if (!(Py_UNICODE_IS_HIGH_SURROGATE(src[0])
              && (src+1) < end
              && Py_UNICODE_IS_LOW_SURROGATE(src[1])))
        {
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1372
Victor Stinnercd9950f2011-10-02 00:34:53 +02001373static int
Victor Stinner488fa492011-12-12 00:01:39 +01001374unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001375{
Victor Stinner488fa492011-12-12 00:01:39 +01001376 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001377 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001378 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001379 return -1;
1380 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001381 return 0;
1382}
1383
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001384static int
1385_copy_characters(PyObject *to, Py_ssize_t to_start,
1386 PyObject *from, Py_ssize_t from_start,
1387 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001389 unsigned int from_kind, to_kind;
1390 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391
Victor Stinneree4544c2012-05-09 22:24:08 +02001392 assert(0 <= how_many);
1393 assert(0 <= from_start);
1394 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001395 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001396 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001397 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398
Victor Stinnerd3f08822012-05-29 12:57:52 +02001399 assert(PyUnicode_Check(to));
1400 assert(PyUnicode_IS_READY(to));
1401 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1402
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001403 if (how_many == 0)
1404 return 0;
1405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001407 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001409 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Victor Stinnerf1852262012-06-16 16:38:26 +02001411#ifdef Py_DEBUG
1412 if (!check_maxchar
1413 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1414 {
1415 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1416 Py_UCS4 ch;
1417 Py_ssize_t i;
1418 for (i=0; i < how_many; i++) {
1419 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1420 assert(ch <= to_maxchar);
1421 }
1422 }
1423#endif
1424
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001425 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001426 if (check_maxchar
1427 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1428 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001429 /* Writing Latin-1 characters into an ASCII string requires to
1430 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001431 Py_UCS4 max_char;
1432 max_char = ucs1lib_find_max_char(from_data,
1433 (Py_UCS1*)from_data + how_many);
1434 if (max_char >= 128)
1435 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001436 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001437 Py_MEMCPY((char*)to_data + to_kind * to_start,
1438 (char*)from_data + from_kind * from_start,
1439 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001441 else if (from_kind == PyUnicode_1BYTE_KIND
1442 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001443 {
1444 _PyUnicode_CONVERT_BYTES(
1445 Py_UCS1, Py_UCS2,
1446 PyUnicode_1BYTE_DATA(from) + from_start,
1447 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1448 PyUnicode_2BYTE_DATA(to) + to_start
1449 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001450 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001451 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001452 && to_kind == PyUnicode_4BYTE_KIND)
1453 {
1454 _PyUnicode_CONVERT_BYTES(
1455 Py_UCS1, Py_UCS4,
1456 PyUnicode_1BYTE_DATA(from) + from_start,
1457 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1458 PyUnicode_4BYTE_DATA(to) + to_start
1459 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001460 }
1461 else if (from_kind == PyUnicode_2BYTE_KIND
1462 && to_kind == PyUnicode_4BYTE_KIND)
1463 {
1464 _PyUnicode_CONVERT_BYTES(
1465 Py_UCS2, Py_UCS4,
1466 PyUnicode_2BYTE_DATA(from) + from_start,
1467 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1468 PyUnicode_4BYTE_DATA(to) + to_start
1469 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001470 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001471 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001472 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1473
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001474 if (!check_maxchar) {
1475 if (from_kind == PyUnicode_2BYTE_KIND
1476 && to_kind == PyUnicode_1BYTE_KIND)
1477 {
1478 _PyUnicode_CONVERT_BYTES(
1479 Py_UCS2, Py_UCS1,
1480 PyUnicode_2BYTE_DATA(from) + from_start,
1481 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1482 PyUnicode_1BYTE_DATA(to) + to_start
1483 );
1484 }
1485 else if (from_kind == PyUnicode_4BYTE_KIND
1486 && to_kind == PyUnicode_1BYTE_KIND)
1487 {
1488 _PyUnicode_CONVERT_BYTES(
1489 Py_UCS4, Py_UCS1,
1490 PyUnicode_4BYTE_DATA(from) + from_start,
1491 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1492 PyUnicode_1BYTE_DATA(to) + to_start
1493 );
1494 }
1495 else if (from_kind == PyUnicode_4BYTE_KIND
1496 && to_kind == PyUnicode_2BYTE_KIND)
1497 {
1498 _PyUnicode_CONVERT_BYTES(
1499 Py_UCS4, Py_UCS2,
1500 PyUnicode_4BYTE_DATA(from) + from_start,
1501 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1502 PyUnicode_2BYTE_DATA(to) + to_start
1503 );
1504 }
1505 else {
1506 assert(0);
1507 return -1;
1508 }
1509 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001510 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001511 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001512 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001513 Py_ssize_t i;
1514
Victor Stinnera0702ab2011-09-29 14:14:38 +02001515 for (i=0; i < how_many; i++) {
1516 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001517 if (ch > to_maxchar)
1518 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001519 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1520 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001521 }
1522 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001523 return 0;
1524}
1525
Victor Stinnerd3f08822012-05-29 12:57:52 +02001526void
1527_PyUnicode_FastCopyCharacters(
1528 PyObject *to, Py_ssize_t to_start,
1529 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001530{
1531 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1532}
1533
1534Py_ssize_t
1535PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1536 PyObject *from, Py_ssize_t from_start,
1537 Py_ssize_t how_many)
1538{
1539 int err;
1540
1541 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1542 PyErr_BadInternalCall();
1543 return -1;
1544 }
1545
Benjamin Petersonbac79492012-01-14 13:34:47 -05001546 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001548 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001549 return -1;
1550
Victor Stinnerd3f08822012-05-29 12:57:52 +02001551 if (from_start < 0) {
1552 PyErr_SetString(PyExc_IndexError, "string index out of range");
1553 return -1;
1554 }
1555 if (to_start < 0) {
1556 PyErr_SetString(PyExc_IndexError, "string index out of range");
1557 return -1;
1558 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001559 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1560 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1561 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001562 "Cannot write %zi characters at %zi "
1563 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 how_many, to_start, PyUnicode_GET_LENGTH(to));
1565 return -1;
1566 }
1567
1568 if (how_many == 0)
1569 return 0;
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001572 return -1;
1573
1574 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1575 if (err) {
1576 PyErr_Format(PyExc_SystemError,
1577 "Cannot copy %s characters "
1578 "into a string of %s characters",
1579 unicode_kind_name(from),
1580 unicode_kind_name(to));
1581 return -1;
1582 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001583 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584}
1585
Victor Stinner17222162011-09-28 22:15:37 +02001586/* Find the maximum code point and count the number of surrogate pairs so a
1587 correct string length can be computed before converting a string to UCS4.
1588 This function counts single surrogates as a character and not as a pair.
1589
1590 Return 0 on success, or -1 on error. */
1591static int
1592find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1593 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594{
1595 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001596 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597
Victor Stinnerc53be962011-10-02 21:33:54 +02001598 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599 *num_surrogates = 0;
1600 *maxchar = 0;
1601
1602 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001604 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1605 && (iter+1) < end
1606 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1607 {
1608 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1609 ++(*num_surrogates);
1610 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 }
1612 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001613#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001614 {
1615 ch = *iter;
1616 iter++;
1617 }
1618 if (ch > *maxchar) {
1619 *maxchar = ch;
1620 if (*maxchar > MAX_UNICODE) {
1621 PyErr_Format(PyExc_ValueError,
1622 "character U+%x is not in range [U+0000; U+10ffff]",
1623 ch);
1624 return -1;
1625 }
1626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627 }
1628 return 0;
1629}
1630
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001631int
1632_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633{
1634 wchar_t *end;
1635 Py_UCS4 maxchar = 0;
1636 Py_ssize_t num_surrogates;
1637#if SIZEOF_WCHAR_T == 2
1638 Py_ssize_t length_wo_surrogates;
1639#endif
1640
Georg Brandl7597add2011-10-05 16:36:47 +02001641 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001642 strings were created using _PyObject_New() and where no canonical
1643 representation (the str field) has been set yet aka strings
1644 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001645 assert(_PyUnicode_CHECK(unicode));
1646 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001648 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001649 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001650 /* Actually, it should neither be interned nor be anything else: */
1651 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001654 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001655 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657
1658 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001659 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1660 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 PyErr_NoMemory();
1662 return -1;
1663 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001664 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665 _PyUnicode_WSTR(unicode), end,
1666 PyUnicode_1BYTE_DATA(unicode));
1667 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1668 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1669 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1670 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001671 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001672 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001673 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 }
1675 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001676 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001677 _PyUnicode_UTF8(unicode) = NULL;
1678 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 }
1680 PyObject_FREE(_PyUnicode_WSTR(unicode));
1681 _PyUnicode_WSTR(unicode) = NULL;
1682 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1683 }
1684 /* In this case we might have to convert down from 4-byte native
1685 wchar_t to 2-byte unicode. */
1686 else if (maxchar < 65536) {
1687 assert(num_surrogates == 0 &&
1688 "FindMaxCharAndNumSurrogatePairs() messed up");
1689
Victor Stinner506f5922011-09-28 22:34:18 +02001690#if SIZEOF_WCHAR_T == 2
1691 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001692 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001693 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1694 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1695 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001696 _PyUnicode_UTF8(unicode) = NULL;
1697 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001698#else
1699 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001700 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001701 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001702 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001703 PyErr_NoMemory();
1704 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705 }
Victor Stinner506f5922011-09-28 22:34:18 +02001706 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1707 _PyUnicode_WSTR(unicode), end,
1708 PyUnicode_2BYTE_DATA(unicode));
1709 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1710 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1711 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001712 _PyUnicode_UTF8(unicode) = NULL;
1713 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001714 PyObject_FREE(_PyUnicode_WSTR(unicode));
1715 _PyUnicode_WSTR(unicode) = NULL;
1716 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1717#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 }
1719 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1720 else {
1721#if SIZEOF_WCHAR_T == 2
1722 /* in case the native representation is 2-bytes, we need to allocate a
1723 new normalized 4-byte version. */
1724 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001725 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1726 PyErr_NoMemory();
1727 return -1;
1728 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001729 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1730 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 PyErr_NoMemory();
1732 return -1;
1733 }
1734 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1735 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001736 _PyUnicode_UTF8(unicode) = NULL;
1737 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001738 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1739 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001740 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 PyObject_FREE(_PyUnicode_WSTR(unicode));
1742 _PyUnicode_WSTR(unicode) = NULL;
1743 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1744#else
1745 assert(num_surrogates == 0);
1746
Victor Stinnerc3c74152011-10-02 20:39:55 +02001747 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001749 _PyUnicode_UTF8(unicode) = NULL;
1750 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1752#endif
1753 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1754 }
1755 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001756 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 return 0;
1758}
1759
Alexander Belopolsky40018472011-02-26 01:02:56 +00001760static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001761unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762{
Walter Dörwald16807132007-05-25 13:52:07 +00001763 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001764 case SSTATE_NOT_INTERNED:
1765 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001766
Benjamin Peterson29060642009-01-31 22:14:21 +00001767 case SSTATE_INTERNED_MORTAL:
1768 /* revive dead object temporarily for DelItem */
1769 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001770 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001771 Py_FatalError(
1772 "deletion of interned string failed");
1773 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001774
Benjamin Peterson29060642009-01-31 22:14:21 +00001775 case SSTATE_INTERNED_IMMORTAL:
1776 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001777
Benjamin Peterson29060642009-01-31 22:14:21 +00001778 default:
1779 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001780 }
1781
Victor Stinner03490912011-10-03 23:45:12 +02001782 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001784 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001785 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001786 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1787 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001789 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790}
1791
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* only a string of exactly one character with a canonical
       representation can be a Latin-1 singleton */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1808
Alexander Belopolsky40018472011-02-26 01:02:56 +00001809static int
Victor Stinner488fa492011-12-12 00:01:39 +01001810unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001811{
Victor Stinner488fa492011-12-12 00:01:39 +01001812 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001813 if (Py_REFCNT(unicode) != 1)
1814 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001815 if (_PyUnicode_HASH(unicode) != -1)
1816 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001817 if (PyUnicode_CHECK_INTERNED(unicode))
1818 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001819 if (!PyUnicode_CheckExact(unicode))
1820 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001821#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001822 /* singleton refcount is greater than 1 */
1823 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001824#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001825 return 1;
1826}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001827
Victor Stinnerfe226c02011-10-03 03:52:20 +02001828static int
1829unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1830{
1831 PyObject *unicode;
1832 Py_ssize_t old_length;
1833
1834 assert(p_unicode != NULL);
1835 unicode = *p_unicode;
1836
1837 assert(unicode != NULL);
1838 assert(PyUnicode_Check(unicode));
1839 assert(0 <= length);
1840
Victor Stinner910337b2011-10-03 03:20:16 +02001841 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001842 old_length = PyUnicode_WSTR_LENGTH(unicode);
1843 else
1844 old_length = PyUnicode_GET_LENGTH(unicode);
1845 if (old_length == length)
1846 return 0;
1847
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001848 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001849 _Py_INCREF_UNICODE_EMPTY();
1850 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001851 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001852 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001853 return 0;
1854 }
1855
Victor Stinner488fa492011-12-12 00:01:39 +01001856 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001857 PyObject *copy = resize_copy(unicode, length);
1858 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001859 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001860 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001861 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001862 }
1863
Victor Stinnerfe226c02011-10-03 03:52:20 +02001864 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001865 PyObject *new_unicode = resize_compact(unicode, length);
1866 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001867 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001868 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001869 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001870 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001871 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001872}
1873
Alexander Belopolsky40018472011-02-26 01:02:56 +00001874int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001875PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001876{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *unicode;
1878 if (p_unicode == NULL) {
1879 PyErr_BadInternalCall();
1880 return -1;
1881 }
1882 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001883 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 {
1885 PyErr_BadInternalCall();
1886 return -1;
1887 }
1888 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001889}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001890
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001891/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001892
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001893 WARNING: The function doesn't copy the terminating null character and
1894 doesn't check the maximum character (may write a latin1 character in an
1895 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001896static void
1897unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1898 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001899{
1900 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1901 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001902 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001903
1904 switch (kind) {
1905 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001906 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001907#ifdef Py_DEBUG
1908 if (PyUnicode_IS_ASCII(unicode)) {
1909 Py_UCS4 maxchar = ucs1lib_find_max_char(
1910 (const Py_UCS1*)str,
1911 (const Py_UCS1*)str + len);
1912 assert(maxchar < 128);
1913 }
1914#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001915 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001916 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001917 }
1918 case PyUnicode_2BYTE_KIND: {
1919 Py_UCS2 *start = (Py_UCS2 *)data + index;
1920 Py_UCS2 *ucs2 = start;
1921 assert(index <= PyUnicode_GET_LENGTH(unicode));
1922
Victor Stinner184252a2012-06-16 02:57:41 +02001923 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001924 *ucs2 = (Py_UCS2)*str;
1925
1926 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001927 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001928 }
1929 default: {
1930 Py_UCS4 *start = (Py_UCS4 *)data + index;
1931 Py_UCS4 *ucs4 = start;
1932 assert(kind == PyUnicode_4BYTE_KIND);
1933 assert(index <= PyUnicode_GET_LENGTH(unicode));
1934
Victor Stinner184252a2012-06-16 02:57:41 +02001935 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001936 *ucs4 = (Py_UCS4)*str;
1937
1938 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001939 }
1940 }
1941}
1942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943static PyObject*
1944get_latin1_char(unsigned char ch)
1945{
Victor Stinnera464fc12011-10-02 20:39:30 +02001946 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001947 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001948 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949 if (!unicode)
1950 return NULL;
1951 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001952 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 unicode_latin1[ch] = unicode;
1954 }
1955 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001956 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957}
1958
Victor Stinner985a82a2014-01-03 12:53:47 +01001959static PyObject*
1960unicode_char(Py_UCS4 ch)
1961{
1962 PyObject *unicode;
1963
1964 assert(ch <= MAX_UNICODE);
1965
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001966 if (ch < 256)
1967 return get_latin1_char(ch);
1968
Victor Stinner985a82a2014-01-03 12:53:47 +01001969 unicode = PyUnicode_New(1, ch);
1970 if (unicode == NULL)
1971 return NULL;
1972 switch (PyUnicode_KIND(unicode)) {
1973 case PyUnicode_1BYTE_KIND:
1974 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1975 break;
1976 case PyUnicode_2BYTE_KIND:
1977 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1978 break;
1979 default:
1980 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1981 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1982 }
1983 assert(_PyUnicode_CheckConsistency(unicode, 1));
1984 return unicode;
1985}
1986
Alexander Belopolsky40018472011-02-26 01:02:56 +00001987PyObject *
1988PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001990 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991 Py_UCS4 maxchar = 0;
1992 Py_ssize_t num_surrogates;
1993
1994 if (u == NULL)
1995 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001997 /* If the Unicode data is known at construction time, we can apply
1998 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002001 if (size == 0)
2002 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 /* Single character Unicode objects in the Latin-1 range are
2005 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002006 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 return get_latin1_char((unsigned char)*u);
2008
2009 /* If not empty and not single character, copy the Unicode data
2010 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002011 if (find_maxchar_surrogates(u, u + size,
2012 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 return NULL;
2014
Victor Stinner8faf8212011-12-08 22:14:11 +01002015 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016 if (!unicode)
2017 return NULL;
2018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 switch (PyUnicode_KIND(unicode)) {
2020 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002021 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2023 break;
2024 case PyUnicode_2BYTE_KIND:
2025#if Py_UNICODE_SIZE == 2
2026 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2027#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002028 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002029 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2030#endif
2031 break;
2032 case PyUnicode_4BYTE_KIND:
2033#if SIZEOF_WCHAR_T == 2
2034 /* This is the only case which has to process surrogates, thus
2035 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002036 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037#else
2038 assert(num_surrogates == 0);
2039 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2040#endif
2041 break;
2042 default:
2043 assert(0 && "Impossible state");
2044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002046 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047}
2048
Alexander Belopolsky40018472011-02-26 01:02:56 +00002049PyObject *
2050PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002051{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002052 if (size < 0) {
2053 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002054 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002055 return NULL;
2056 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002057 if (u != NULL)
2058 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2059 else
2060 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002061}
2062
Alexander Belopolsky40018472011-02-26 01:02:56 +00002063PyObject *
2064PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002065{
2066 size_t size = strlen(u);
2067 if (size > PY_SSIZE_T_MAX) {
2068 PyErr_SetString(PyExc_OverflowError, "input too long");
2069 return NULL;
2070 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002071 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002072}
2073
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002074PyObject *
2075_PyUnicode_FromId(_Py_Identifier *id)
2076{
2077 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002078 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2079 strlen(id->string),
2080 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002081 if (!id->object)
2082 return NULL;
2083 PyUnicode_InternInPlace(&id->object);
2084 assert(!id->next);
2085 id->next = static_strings;
2086 static_strings = id;
2087 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002088 return id->object;
2089}
2090
2091void
2092_PyUnicode_ClearStaticStrings()
2093{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002094 _Py_Identifier *tmp, *s = static_strings;
2095 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002096 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002097 tmp = s->next;
2098 s->next = NULL;
2099 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002100 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002101 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002102}
2103
Benjamin Peterson0df54292012-03-26 14:50:32 -04002104/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002105
Victor Stinnerd3f08822012-05-29 12:57:52 +02002106PyObject*
2107_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002108{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002109 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002110 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002111 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002112#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002113 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002114#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002115 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002116 }
Victor Stinner785938e2011-12-11 20:09:03 +01002117 unicode = PyUnicode_New(size, 127);
2118 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002119 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002120 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2121 assert(_PyUnicode_CheckConsistency(unicode, 1));
2122 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002123}
2124
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002125static Py_UCS4
2126kind_maxchar_limit(unsigned int kind)
2127{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002128 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002129 case PyUnicode_1BYTE_KIND:
2130 return 0x80;
2131 case PyUnicode_2BYTE_KIND:
2132 return 0x100;
2133 case PyUnicode_4BYTE_KIND:
2134 return 0x10000;
2135 default:
2136 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01002137 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002138 }
2139}
2140
Victor Stinnere6abb482012-05-02 01:15:40 +02002141Py_LOCAL_INLINE(Py_UCS4)
2142align_maxchar(Py_UCS4 maxchar)
2143{
2144 if (maxchar <= 127)
2145 return 127;
2146 else if (maxchar <= 255)
2147 return 255;
2148 else if (maxchar <= 65535)
2149 return 65535;
2150 else
2151 return MAX_UNICODE;
2152}
2153
Victor Stinner702c7342011-10-05 13:50:52 +02002154static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002155_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002156{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002158 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002159
Serhiy Storchaka678db842013-01-26 12:16:36 +02002160 if (size == 0)
2161 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002162 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002163 if (size == 1)
2164 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002165
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002166 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002167 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 if (!res)
2169 return NULL;
2170 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002171 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002173}
2174
Victor Stinnere57b1c02011-09-28 22:20:48 +02002175static PyObject*
2176_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177{
2178 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002179 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002180
Serhiy Storchaka678db842013-01-26 12:16:36 +02002181 if (size == 0)
2182 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002183 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002184 if (size == 1)
2185 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002186
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002187 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002188 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 if (!res)
2190 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002191 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002193 else {
2194 _PyUnicode_CONVERT_BYTES(
2195 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2196 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002197 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 return res;
2199}
2200
Victor Stinnere57b1c02011-09-28 22:20:48 +02002201static PyObject*
2202_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203{
2204 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002206
Serhiy Storchaka678db842013-01-26 12:16:36 +02002207 if (size == 0)
2208 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002209 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002210 if (size == 1)
2211 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002212
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002213 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002214 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 if (!res)
2216 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002217 if (max_char < 256)
2218 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2219 PyUnicode_1BYTE_DATA(res));
2220 else if (max_char < 0x10000)
2221 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2222 PyUnicode_2BYTE_DATA(res));
2223 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002225 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 return res;
2227}
2228
2229PyObject*
2230PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2231{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002232 if (size < 0) {
2233 PyErr_SetString(PyExc_ValueError, "size must be positive");
2234 return NULL;
2235 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002236 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002238 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002240 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002242 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002243 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002244 PyErr_SetString(PyExc_SystemError, "invalid kind");
2245 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247}
2248
Victor Stinnerece58de2012-04-23 23:36:38 +02002249Py_UCS4
2250_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2251{
2252 enum PyUnicode_Kind kind;
2253 void *startptr, *endptr;
2254
2255 assert(PyUnicode_IS_READY(unicode));
2256 assert(0 <= start);
2257 assert(end <= PyUnicode_GET_LENGTH(unicode));
2258 assert(start <= end);
2259
2260 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2261 return PyUnicode_MAX_CHAR_VALUE(unicode);
2262
2263 if (start == end)
2264 return 127;
2265
Victor Stinner94d558b2012-04-27 22:26:58 +02002266 if (PyUnicode_IS_ASCII(unicode))
2267 return 127;
2268
Victor Stinnerece58de2012-04-23 23:36:38 +02002269 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002270 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002271 endptr = (char *)startptr + end * kind;
2272 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002273 switch(kind) {
2274 case PyUnicode_1BYTE_KIND:
2275 return ucs1lib_find_max_char(startptr, endptr);
2276 case PyUnicode_2BYTE_KIND:
2277 return ucs2lib_find_max_char(startptr, endptr);
2278 case PyUnicode_4BYTE_KIND:
2279 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002280 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002281 assert(0);
2282 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002283 }
2284}
2285
Victor Stinner25a4b292011-10-06 12:31:55 +02002286/* Ensure that a string uses the most efficient storage, if it is not the
2287 case: create a new string with of the right kind. Write NULL into *p_unicode
2288 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002289static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002290unicode_adjust_maxchar(PyObject **p_unicode)
2291{
2292 PyObject *unicode, *copy;
2293 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002294 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002295 unsigned int kind;
2296
2297 assert(p_unicode != NULL);
2298 unicode = *p_unicode;
2299 assert(PyUnicode_IS_READY(unicode));
2300 if (PyUnicode_IS_ASCII(unicode))
2301 return;
2302
2303 len = PyUnicode_GET_LENGTH(unicode);
2304 kind = PyUnicode_KIND(unicode);
2305 if (kind == PyUnicode_1BYTE_KIND) {
2306 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002307 max_char = ucs1lib_find_max_char(u, u + len);
2308 if (max_char >= 128)
2309 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002310 }
2311 else if (kind == PyUnicode_2BYTE_KIND) {
2312 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002313 max_char = ucs2lib_find_max_char(u, u + len);
2314 if (max_char >= 256)
2315 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002316 }
2317 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002318 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002319 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002320 max_char = ucs4lib_find_max_char(u, u + len);
2321 if (max_char >= 0x10000)
2322 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002323 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002324 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002325 if (copy != NULL)
2326 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002327 Py_DECREF(unicode);
2328 *p_unicode = copy;
2329}
2330
Victor Stinner034f6cf2011-09-30 02:26:44 +02002331PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002332_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002333{
Victor Stinner87af4f22011-11-21 23:03:47 +01002334 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002335 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002336
Victor Stinner034f6cf2011-09-30 02:26:44 +02002337 if (!PyUnicode_Check(unicode)) {
2338 PyErr_BadInternalCall();
2339 return NULL;
2340 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002341 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002342 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002343
Victor Stinner87af4f22011-11-21 23:03:47 +01002344 length = PyUnicode_GET_LENGTH(unicode);
2345 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002346 if (!copy)
2347 return NULL;
2348 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2349
Victor Stinner87af4f22011-11-21 23:03:47 +01002350 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2351 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002352 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002353 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002354}
2355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356
Victor Stinnerbc603d12011-10-02 01:00:40 +02002357/* Widen Unicode objects to larger buffers. Don't write terminating null
2358 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359
2360void*
2361_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2362{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002363 Py_ssize_t len;
2364 void *result;
2365 unsigned int skind;
2366
Benjamin Petersonbac79492012-01-14 13:34:47 -05002367 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002368 return NULL;
2369
2370 len = PyUnicode_GET_LENGTH(s);
2371 skind = PyUnicode_KIND(s);
2372 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002373 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 return NULL;
2375 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002376 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002377 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002378 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002379 if (!result)
2380 return PyErr_NoMemory();
2381 assert(skind == PyUnicode_1BYTE_KIND);
2382 _PyUnicode_CONVERT_BYTES(
2383 Py_UCS1, Py_UCS2,
2384 PyUnicode_1BYTE_DATA(s),
2385 PyUnicode_1BYTE_DATA(s) + len,
2386 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002388 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002389 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002390 if (!result)
2391 return PyErr_NoMemory();
2392 if (skind == PyUnicode_2BYTE_KIND) {
2393 _PyUnicode_CONVERT_BYTES(
2394 Py_UCS2, Py_UCS4,
2395 PyUnicode_2BYTE_DATA(s),
2396 PyUnicode_2BYTE_DATA(s) + len,
2397 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002399 else {
2400 assert(skind == PyUnicode_1BYTE_KIND);
2401 _PyUnicode_CONVERT_BYTES(
2402 Py_UCS1, Py_UCS4,
2403 PyUnicode_1BYTE_DATA(s),
2404 PyUnicode_1BYTE_DATA(s) + len,
2405 result);
2406 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002408 default:
2409 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 }
Victor Stinner01698042011-10-04 00:04:26 +02002411 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return NULL;
2413}
2414
2415static Py_UCS4*
2416as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2417 int copy_null)
2418{
2419 int kind;
2420 void *data;
2421 Py_ssize_t len, targetlen;
2422 if (PyUnicode_READY(string) == -1)
2423 return NULL;
2424 kind = PyUnicode_KIND(string);
2425 data = PyUnicode_DATA(string);
2426 len = PyUnicode_GET_LENGTH(string);
2427 targetlen = len;
2428 if (copy_null)
2429 targetlen++;
2430 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002431 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 if (!target) {
2433 PyErr_NoMemory();
2434 return NULL;
2435 }
2436 }
2437 else {
2438 if (targetsize < targetlen) {
2439 PyErr_Format(PyExc_SystemError,
2440 "string is longer than the buffer");
2441 if (copy_null && 0 < targetsize)
2442 target[0] = 0;
2443 return NULL;
2444 }
2445 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002446 if (kind == PyUnicode_1BYTE_KIND) {
2447 Py_UCS1 *start = (Py_UCS1 *) data;
2448 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002450 else if (kind == PyUnicode_2BYTE_KIND) {
2451 Py_UCS2 *start = (Py_UCS2 *) data;
2452 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2453 }
2454 else {
2455 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 if (copy_null)
2459 target[len] = 0;
2460 return target;
2461}
2462
2463Py_UCS4*
2464PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2465 int copy_null)
2466{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002467 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 PyErr_BadInternalCall();
2469 return NULL;
2470 }
2471 return as_ucs4(string, target, targetsize, copy_null);
2472}
2473
2474Py_UCS4*
2475PyUnicode_AsUCS4Copy(PyObject *string)
2476{
2477 return as_ucs4(string, NULL, 0, 1);
2478}
2479
2480#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002481
Alexander Belopolsky40018472011-02-26 01:02:56 +00002482PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002483PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002485 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002487 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002488 PyErr_BadInternalCall();
2489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 }
2491
Martin v. Löwis790465f2008-04-05 20:41:37 +00002492 if (size == -1) {
2493 size = wcslen(w);
2494 }
2495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497}
2498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002500
/* Size of the character buffers used to format a %lld or %p argument.
   The magnitude needs at most ceil(log10(256)*SIZEOF_LONG_LONG) digits and
   one more character is reserved for the sign.  53/22 serves as an upper
   bound for log10(256). */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG * 53 - 1) / 22) + 2)
Victor Stinner96865452011-03-01 23:44:09 +00002505
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002506static int
2507unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2508 Py_ssize_t width, Py_ssize_t precision)
2509{
2510 Py_ssize_t length, fill, arglen;
2511 Py_UCS4 maxchar;
2512
2513 if (PyUnicode_READY(str) == -1)
2514 return -1;
2515
2516 length = PyUnicode_GET_LENGTH(str);
2517 if ((precision == -1 || precision >= length)
2518 && width <= length)
2519 return _PyUnicodeWriter_WriteStr(writer, str);
2520
2521 if (precision != -1)
2522 length = Py_MIN(precision, length);
2523
2524 arglen = Py_MAX(length, width);
2525 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2526 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2527 else
2528 maxchar = writer->maxchar;
2529
2530 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2531 return -1;
2532
2533 if (width > length) {
2534 fill = width - length;
2535 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2536 return -1;
2537 writer->pos += fill;
2538 }
2539
2540 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2541 str, 0, length);
2542 writer->pos += length;
2543 return 0;
2544}
2545
2546static int
2547unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2548 Py_ssize_t width, Py_ssize_t precision)
2549{
2550 /* UTF-8 */
2551 Py_ssize_t length;
2552 PyObject *unicode;
2553 int res;
2554
2555 length = strlen(str);
2556 if (precision != -1)
2557 length = Py_MIN(length, precision);
2558 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2559 if (unicode == NULL)
2560 return -1;
2561
2562 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2563 Py_DECREF(unicode);
2564 return res;
2565}
2566
Victor Stinner96865452011-03-01 23:44:09 +00002567static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002568unicode_fromformat_arg(_PyUnicodeWriter *writer,
2569 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002570{
Victor Stinnere215d962012-10-06 23:03:36 +02002571 const char *p;
2572 Py_ssize_t len;
2573 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002574 Py_ssize_t width;
2575 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002576 int longflag;
2577 int longlongflag;
2578 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002579 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002580
2581 p = f;
2582 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002583 zeropad = 0;
2584 if (*f == '0') {
2585 zeropad = 1;
2586 f++;
2587 }
Victor Stinner96865452011-03-01 23:44:09 +00002588
2589 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002590 width = -1;
2591 if (Py_ISDIGIT((unsigned)*f)) {
2592 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002593 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002594 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002596 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002597 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002598 return NULL;
2599 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002600 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002601 f++;
2602 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002603 }
2604 precision = -1;
2605 if (*f == '.') {
2606 f++;
2607 if (Py_ISDIGIT((unsigned)*f)) {
2608 precision = (*f - '0');
2609 f++;
2610 while (Py_ISDIGIT((unsigned)*f)) {
2611 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2612 PyErr_SetString(PyExc_ValueError,
2613 "precision too big");
2614 return NULL;
2615 }
2616 precision = (precision * 10) + (*f - '0');
2617 f++;
2618 }
2619 }
Victor Stinner96865452011-03-01 23:44:09 +00002620 if (*f == '%') {
2621 /* "%.3%s" => f points to "3" */
2622 f--;
2623 }
2624 }
2625 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002626 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002627 f--;
2628 }
Victor Stinner96865452011-03-01 23:44:09 +00002629
2630 /* Handle %ld, %lu, %lld and %llu. */
2631 longflag = 0;
2632 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002633 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002634 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002635 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002636 longflag = 1;
2637 ++f;
2638 }
2639#ifdef HAVE_LONG_LONG
2640 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002641 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002642 longlongflag = 1;
2643 f += 2;
2644 }
2645#endif
2646 }
2647 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002648 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002649 size_tflag = 1;
2650 ++f;
2651 }
Victor Stinnere215d962012-10-06 23:03:36 +02002652
2653 if (f[1] == '\0')
2654 writer->overallocate = 0;
2655
2656 switch (*f) {
2657 case 'c':
2658 {
2659 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002660 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002661 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002662 "character argument not in range(0x110000)");
2663 return NULL;
2664 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002665 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002666 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002667 break;
2668 }
2669
2670 case 'i':
2671 case 'd':
2672 case 'u':
2673 case 'x':
2674 {
2675 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002676 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002677 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002678
2679 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002680 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002681 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002682 va_arg(*vargs, unsigned long));
2683#ifdef HAVE_LONG_LONG
2684 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002685 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002686 va_arg(*vargs, unsigned PY_LONG_LONG));
2687#endif
2688 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002689 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002690 va_arg(*vargs, size_t));
2691 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002692 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002693 va_arg(*vargs, unsigned int));
2694 }
2695 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002696 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002697 }
2698 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002699 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002700 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002701 va_arg(*vargs, long));
2702#ifdef HAVE_LONG_LONG
2703 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002704 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002705 va_arg(*vargs, PY_LONG_LONG));
2706#endif
2707 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002708 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002709 va_arg(*vargs, Py_ssize_t));
2710 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002711 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002712 va_arg(*vargs, int));
2713 }
2714 assert(len >= 0);
2715
Victor Stinnere215d962012-10-06 23:03:36 +02002716 if (precision < len)
2717 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002718
2719 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002720 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2721 return NULL;
2722
Victor Stinnere215d962012-10-06 23:03:36 +02002723 if (width > precision) {
2724 Py_UCS4 fillchar;
2725 fill = width - precision;
2726 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002727 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2728 return NULL;
2729 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002730 }
Victor Stinner15a11362012-10-06 23:48:20 +02002731 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002732 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002733 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2734 return NULL;
2735 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002736 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002737
Victor Stinner4a587072013-11-19 12:54:53 +01002738 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2739 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002740 break;
2741 }
2742
2743 case 'p':
2744 {
2745 char number[MAX_LONG_LONG_CHARS];
2746
2747 len = sprintf(number, "%p", va_arg(*vargs, void*));
2748 assert(len >= 0);
2749
2750 /* %p is ill-defined: ensure leading 0x. */
2751 if (number[1] == 'X')
2752 number[1] = 'x';
2753 else if (number[1] != 'x') {
2754 memmove(number + 2, number,
2755 strlen(number) + 1);
2756 number[0] = '0';
2757 number[1] = 'x';
2758 len += 2;
2759 }
2760
Victor Stinner4a587072013-11-19 12:54:53 +01002761 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002762 return NULL;
2763 break;
2764 }
2765
2766 case 's':
2767 {
2768 /* UTF-8 */
2769 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002770 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002771 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002772 break;
2773 }
2774
2775 case 'U':
2776 {
2777 PyObject *obj = va_arg(*vargs, PyObject *);
2778 assert(obj && _PyUnicode_CHECK(obj));
2779
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002781 return NULL;
2782 break;
2783 }
2784
2785 case 'V':
2786 {
2787 PyObject *obj = va_arg(*vargs, PyObject *);
2788 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002789 if (obj) {
2790 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002791 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002792 return NULL;
2793 }
2794 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002795 assert(str != NULL);
2796 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002797 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002798 }
2799 break;
2800 }
2801
2802 case 'S':
2803 {
2804 PyObject *obj = va_arg(*vargs, PyObject *);
2805 PyObject *str;
2806 assert(obj);
2807 str = PyObject_Str(obj);
2808 if (!str)
2809 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002810 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002811 Py_DECREF(str);
2812 return NULL;
2813 }
2814 Py_DECREF(str);
2815 break;
2816 }
2817
2818 case 'R':
2819 {
2820 PyObject *obj = va_arg(*vargs, PyObject *);
2821 PyObject *repr;
2822 assert(obj);
2823 repr = PyObject_Repr(obj);
2824 if (!repr)
2825 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002826 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002827 Py_DECREF(repr);
2828 return NULL;
2829 }
2830 Py_DECREF(repr);
2831 break;
2832 }
2833
2834 case 'A':
2835 {
2836 PyObject *obj = va_arg(*vargs, PyObject *);
2837 PyObject *ascii;
2838 assert(obj);
2839 ascii = PyObject_ASCII(obj);
2840 if (!ascii)
2841 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002842 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002843 Py_DECREF(ascii);
2844 return NULL;
2845 }
2846 Py_DECREF(ascii);
2847 break;
2848 }
2849
2850 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002851 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002852 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002853 break;
2854
2855 default:
2856 /* if we stumble upon an unknown formatting code, copy the rest
2857 of the format string to the output string. (we cannot just
2858 skip the code, since there's no way to know what's in the
2859 argument list) */
2860 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002861 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002862 return NULL;
2863 f = p+len;
2864 return f;
2865 }
2866
2867 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002868 return f;
2869}
2870
Walter Dörwaldd2034312007-05-18 16:29:38 +00002871PyObject *
2872PyUnicode_FromFormatV(const char *format, va_list vargs)
2873{
Victor Stinnere215d962012-10-06 23:03:36 +02002874 va_list vargs2;
2875 const char *f;
2876 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002877
Victor Stinner8f674cc2013-04-17 23:02:17 +02002878 _PyUnicodeWriter_Init(&writer);
2879 writer.min_length = strlen(format) + 100;
2880 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002881
2882 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2883 Copy it to be able to pass a reference to a subfunction. */
2884 Py_VA_COPY(vargs2, vargs);
2885
2886 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002887 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002888 f = unicode_fromformat_arg(&writer, f, &vargs2);
2889 if (f == NULL)
2890 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002892 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002893 const char *p;
2894 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002895
Victor Stinnere215d962012-10-06 23:03:36 +02002896 p = f;
2897 do
2898 {
2899 if ((unsigned char)*p > 127) {
2900 PyErr_Format(PyExc_ValueError,
2901 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2902 "string, got a non-ASCII byte: 0x%02x",
2903 (unsigned char)*p);
2904 return NULL;
2905 }
2906 p++;
2907 }
2908 while (*p != '\0' && *p != '%');
2909 len = p - f;
2910
2911 if (*p == '\0')
2912 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002913
2914 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002915 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002916
2917 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002918 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002919 }
Victor Stinnere215d962012-10-06 23:03:36 +02002920 return _PyUnicodeWriter_Finish(&writer);
2921
2922 fail:
2923 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002924 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002925}
2926
Walter Dörwaldd2034312007-05-18 16:29:38 +00002927PyObject *
2928PyUnicode_FromFormat(const char *format, ...)
2929{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002930 PyObject* ret;
2931 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002932
2933#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002934 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002936 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002937#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 ret = PyUnicode_FromFormatV(format, vargs);
2939 va_end(vargs);
2940 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941}
2942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002943#ifdef HAVE_WCHAR_H
2944
Victor Stinner5593d8a2010-10-02 11:11:27 +00002945/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2946 convert a Unicode object to a wide character string.
2947
Victor Stinnerd88d9832011-09-06 02:00:05 +02002948 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002949 character) required to convert the unicode object. Ignore size argument.
2950
Victor Stinnerd88d9832011-09-06 02:00:05 +02002951 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002952 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002953 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002954static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002955unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002956 wchar_t *w,
2957 Py_ssize_t size)
2958{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002959 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002960 const wchar_t *wstr;
2961
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002962 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002963 if (wstr == NULL)
2964 return -1;
2965
Victor Stinner5593d8a2010-10-02 11:11:27 +00002966 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002967 if (size > res)
2968 size = res + 1;
2969 else
2970 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002971 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002972 return res;
2973 }
2974 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002975 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002976}
2977
2978Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002979PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002980 wchar_t *w,
2981 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982{
2983 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 PyErr_BadInternalCall();
2985 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002987 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988}
2989
Victor Stinner137c34c2010-09-29 10:25:54 +00002990wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002991PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002992 Py_ssize_t *size)
2993{
2994 wchar_t* buffer;
2995 Py_ssize_t buflen;
2996
2997 if (unicode == NULL) {
2998 PyErr_BadInternalCall();
2999 return NULL;
3000 }
3001
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003002 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003003 if (buflen == -1)
3004 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003005 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00003006 if (buffer == NULL) {
3007 PyErr_NoMemory();
3008 return NULL;
3009 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003010 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02003011 if (buflen == -1) {
3012 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003013 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02003014 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00003015 if (size != NULL)
3016 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003017 return buffer;
3018}
3019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003020#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021
Alexander Belopolsky40018472011-02-26 01:02:56 +00003022PyObject *
3023PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003024{
Victor Stinner8faf8212011-12-08 22:14:11 +01003025 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003026 PyErr_SetString(PyExc_ValueError,
3027 "chr() arg not in range(0x110000)");
3028 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003029 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003030
Victor Stinner985a82a2014-01-03 12:53:47 +01003031 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003032}
3033
Alexander Belopolsky40018472011-02-26 01:02:56 +00003034PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003035PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003037 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003038 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003039 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003040 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003041 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 Py_INCREF(obj);
3043 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003044 }
3045 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003046 /* For a Unicode subtype that's not a Unicode object,
3047 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003048 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003049 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003050 PyErr_Format(PyExc_TypeError,
3051 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003052 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003053 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003054}
3055
Alexander Belopolsky40018472011-02-26 01:02:56 +00003056PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003057PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003058 const char *encoding,
3059 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003060{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003061 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003062 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003063
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003065 PyErr_BadInternalCall();
3066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003068
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003069 /* Decoding bytes objects is the most common case and should be fast */
3070 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003071 if (PyBytes_GET_SIZE(obj) == 0)
3072 _Py_RETURN_UNICODE_EMPTY();
3073 v = PyUnicode_Decode(
3074 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3075 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003076 return v;
3077 }
3078
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003079 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 PyErr_SetString(PyExc_TypeError,
3081 "decoding str is not supported");
3082 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003083 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003084
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003085 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3086 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3087 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003088 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003089 Py_TYPE(obj)->tp_name);
3090 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003091 }
Tim Petersced69f82003-09-16 20:30:58 +00003092
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003094 PyBuffer_Release(&buffer);
3095 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003097
Serhiy Storchaka05997252013-01-26 12:14:02 +02003098 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003099 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003100 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101}
3102
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result, including its terminating null byte, is stored in lower,
   which can hold lower_len bytes.

   Return 0 on error (encoding is longer than lower_len-1, or lower_len is
   0), 1 on success. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminator.  Without this
       check lower_len - 1 would wrap around and the bound below would
       never be reached. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3142
Alexander Belopolsky40018472011-02-26 01:02:56 +00003143PyObject *
3144PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003145 Py_ssize_t size,
3146 const char *encoding,
3147 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003148{
3149 PyObject *buffer = NULL, *unicode;
3150 Py_buffer info;
3151 char lower[11]; /* Enough for any encoding shortcut */
3152
Fred Drakee4315f52000-05-09 19:53:39 +00003153 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003154 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003155 if ((strcmp(lower, "utf-8") == 0) ||
3156 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003157 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003158 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003159 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003160 (strcmp(lower, "iso-8859-1") == 0) ||
3161 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003162 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003163#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003164 else if (strcmp(lower, "mbcs") == 0)
3165 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003166#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003167 else if (strcmp(lower, "ascii") == 0)
3168 return PyUnicode_DecodeASCII(s, size, errors);
3169 else if (strcmp(lower, "utf-16") == 0)
3170 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3171 else if (strcmp(lower, "utf-32") == 0)
3172 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003174
3175 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003176 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003177 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003178 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003179 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003180 if (buffer == NULL)
3181 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003182 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183 if (unicode == NULL)
3184 goto onError;
3185 if (!PyUnicode_Check(unicode)) {
3186 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003187 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3188 "use codecs.decode() to decode to arbitrary types",
3189 encoding,
3190 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 Py_DECREF(unicode);
3192 goto onError;
3193 }
3194 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003195 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003196
Benjamin Peterson29060642009-01-31 22:14:21 +00003197 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 Py_XDECREF(buffer);
3199 return NULL;
3200}
3201
Alexander Belopolsky40018472011-02-26 01:02:56 +00003202PyObject *
3203PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003204 const char *encoding,
3205 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003206{
3207 PyObject *v;
3208
3209 if (!PyUnicode_Check(unicode)) {
3210 PyErr_BadArgument();
3211 goto onError;
3212 }
3213
3214 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003215 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003216
3217 /* Decode via the codec registry */
3218 v = PyCodec_Decode(unicode, encoding, errors);
3219 if (v == NULL)
3220 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003221 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003222
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003224 return NULL;
3225}
3226
Alexander Belopolsky40018472011-02-26 01:02:56 +00003227PyObject *
3228PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003229 const char *encoding,
3230 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003231{
3232 PyObject *v;
3233
3234 if (!PyUnicode_Check(unicode)) {
3235 PyErr_BadArgument();
3236 goto onError;
3237 }
3238
3239 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003240 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003241
3242 /* Decode via the codec registry */
3243 v = PyCodec_Decode(unicode, encoding, errors);
3244 if (v == NULL)
3245 goto onError;
3246 if (!PyUnicode_Check(v)) {
3247 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003248 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3249 "use codecs.decode() to decode to arbitrary types",
3250 encoding,
3251 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003252 Py_DECREF(v);
3253 goto onError;
3254 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003255 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003256
Benjamin Peterson29060642009-01-31 22:14:21 +00003257 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003258 return NULL;
3259}
3260
Alexander Belopolsky40018472011-02-26 01:02:56 +00003261PyObject *
3262PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003263 Py_ssize_t size,
3264 const char *encoding,
3265 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266{
3267 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003268
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 unicode = PyUnicode_FromUnicode(s, size);
3270 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3273 Py_DECREF(unicode);
3274 return v;
3275}
3276
Alexander Belopolsky40018472011-02-26 01:02:56 +00003277PyObject *
3278PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003279 const char *encoding,
3280 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003281{
3282 PyObject *v;
3283
3284 if (!PyUnicode_Check(unicode)) {
3285 PyErr_BadArgument();
3286 goto onError;
3287 }
3288
3289 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003290 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003291
3292 /* Encode via the codec registry */
3293 v = PyCodec_Encode(unicode, encoding, errors);
3294 if (v == NULL)
3295 goto onError;
3296 return v;
3297
Benjamin Peterson29060642009-01-31 22:14:21 +00003298 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003299 return NULL;
3300}
3301
/* Locate the first wide character of wstr that wcstombs() cannot encode.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Returns the index of the
   first chunk that fails, or 0 if no chunk fails. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    /* room for a surrogate pair plus the terminating null */
    wchar_t chunk[3] = {0, 0, 0};
#else
    /* a single character plus the terminating null */
    wchar_t chunk[2] = {0, 0};
#endif
    char encoded[MB_LEN_MAX];
    const wchar_t *cur = wstr;

    while (*cur != L'\0') {
        const wchar_t *chunk_start = cur;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cur[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cur[1]))
        {
            chunk[0] = cur[0];
            chunk[1] = cur[1];
            cur += 2;
        }
        else {
            chunk[0] = cur[0];
            chunk[1] = 0;
            cur += 1;
        }
#else
        chunk[0] = *cur++;
#endif
        if (wcstombs(encoded, chunk, sizeof(encoded)) == (size_t)-1)
            return chunk_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3348
Victor Stinner1b579672011-12-17 05:47:23 +01003349static int
3350locale_error_handler(const char *errors, int *surrogateescape)
3351{
Victor Stinner50149202015-09-22 00:26:54 +02003352 _Py_error_handler error_handler = get_error_handler(errors);
3353 switch (error_handler)
3354 {
3355 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003356 *surrogateescape = 0;
3357 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003358 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003359 *surrogateescape = 1;
3360 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003361 default:
3362 PyErr_Format(PyExc_ValueError,
3363 "only 'strict' and 'surrogateescape' error handlers "
3364 "are supported, not '%s'",
3365 errors);
3366 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003367 }
Victor Stinner1b579672011-12-17 05:47:23 +01003368}
3369
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003370PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003371PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003372{
3373 Py_ssize_t wlen, wlen2;
3374 wchar_t *wstr;
3375 PyObject *bytes = NULL;
3376 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003377 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003378 PyObject *exc;
3379 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003380 int surrogateescape;
3381
3382 if (locale_error_handler(errors, &surrogateescape) < 0)
3383 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003384
3385 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3386 if (wstr == NULL)
3387 return NULL;
3388
3389 wlen2 = wcslen(wstr);
3390 if (wlen2 != wlen) {
3391 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003392 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003393 return NULL;
3394 }
3395
3396 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003397 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003398 char *str;
3399
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003400 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003401 if (str == NULL) {
3402 if (error_pos == (size_t)-1) {
3403 PyErr_NoMemory();
3404 PyMem_Free(wstr);
3405 return NULL;
3406 }
3407 else {
3408 goto encode_error;
3409 }
3410 }
3411 PyMem_Free(wstr);
3412
3413 bytes = PyBytes_FromString(str);
3414 PyMem_Free(str);
3415 }
3416 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003417 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003418 size_t len, len2;
3419
3420 len = wcstombs(NULL, wstr, 0);
3421 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003422 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003423 goto encode_error;
3424 }
3425
3426 bytes = PyBytes_FromStringAndSize(NULL, len);
3427 if (bytes == NULL) {
3428 PyMem_Free(wstr);
3429 return NULL;
3430 }
3431
3432 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3433 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003434 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003435 goto encode_error;
3436 }
3437 PyMem_Free(wstr);
3438 }
3439 return bytes;
3440
3441encode_error:
3442 errmsg = strerror(errno);
3443 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003444
3445 if (error_pos == (size_t)-1)
3446 error_pos = wcstombs_errorpos(wstr);
3447
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003448 PyMem_Free(wstr);
3449 Py_XDECREF(bytes);
3450
Victor Stinner2f197072011-12-17 07:08:30 +01003451 if (errmsg != NULL) {
3452 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003453 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003454 if (wstr != NULL) {
3455 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003456 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003457 } else
3458 errmsg = NULL;
3459 }
3460 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003461 reason = PyUnicode_FromString(
3462 "wcstombs() encountered an unencodable "
3463 "wide character");
3464 if (reason == NULL)
3465 return NULL;
3466
3467 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3468 "locale", unicode,
3469 (Py_ssize_t)error_pos,
3470 (Py_ssize_t)(error_pos+1),
3471 reason);
3472 Py_DECREF(reason);
3473 if (exc != NULL) {
3474 PyCodec_StrictErrors(exc);
3475 Py_XDECREF(exc);
3476 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003477 return NULL;
3478}
3479
Victor Stinnerad158722010-10-27 00:25:46 +00003480PyObject *
3481PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003482{
Victor Stinner99b95382011-07-04 14:23:54 +02003483#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003484 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003485#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003486 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003487#else
Victor Stinner793b5312011-04-27 00:24:21 +02003488 PyInterpreterState *interp = PyThreadState_GET()->interp;
3489 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3490 cannot use it to encode and decode filenames before it is loaded. Load
3491 the Python codec requires to encode at least its own filename. Use the C
3492 version of the locale codec until the codec registry is initialized and
3493 the Python codec is loaded.
3494
3495 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3496 cannot only rely on it: check also interp->fscodec_initialized for
3497 subinterpreters. */
3498 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003499 return PyUnicode_AsEncodedString(unicode,
3500 Py_FileSystemDefaultEncoding,
3501 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003502 }
3503 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003504 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003505 }
Victor Stinnerad158722010-10-27 00:25:46 +00003506#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003507}
3508
Alexander Belopolsky40018472011-02-26 01:02:56 +00003509PyObject *
3510PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003511 const char *encoding,
3512 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003513{
3514 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003515 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003516
Guido van Rossumd57fd912000-03-10 22:53:23 +00003517 if (!PyUnicode_Check(unicode)) {
3518 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 }
Fred Drakee4315f52000-05-09 19:53:39 +00003521
Fred Drakee4315f52000-05-09 19:53:39 +00003522 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003523 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003524 if ((strcmp(lower, "utf-8") == 0) ||
3525 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003526 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003527 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003528 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003529 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003530 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003531 }
Victor Stinner37296e82010-06-10 13:36:23 +00003532 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003533 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003534 (strcmp(lower, "iso-8859-1") == 0) ||
3535 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003536 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003537#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003538 else if (strcmp(lower, "mbcs") == 0)
3539 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003540#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003541 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003542 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003543 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544
3545 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003546 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003548 return NULL;
3549
3550 /* The normal path */
3551 if (PyBytes_Check(v))
3552 return v;
3553
3554 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003555 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003556 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003557 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003558
3559 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003560 "encoder %s returned bytearray instead of bytes; "
3561 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003562 encoding);
3563 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003564 Py_DECREF(v);
3565 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003566 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003567
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003568 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3569 Py_DECREF(v);
3570 return b;
3571 }
3572
3573 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003574 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3575 "use codecs.encode() to encode to arbitrary types",
3576 encoding,
3577 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003578 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003579 return NULL;
3580}
3581
Alexander Belopolsky40018472011-02-26 01:02:56 +00003582PyObject *
3583PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003584 const char *encoding,
3585 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003586{
3587 PyObject *v;
3588
3589 if (!PyUnicode_Check(unicode)) {
3590 PyErr_BadArgument();
3591 goto onError;
3592 }
3593
3594 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003595 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003596
3597 /* Encode via the codec registry */
3598 v = PyCodec_Encode(unicode, encoding, errors);
3599 if (v == NULL)
3600 goto onError;
3601 if (!PyUnicode_Check(v)) {
3602 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003603 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3604 "use codecs.encode() to encode to arbitrary types",
3605 encoding,
3606 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003607 Py_DECREF(v);
3608 goto onError;
3609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003611
Benjamin Peterson29060642009-01-31 22:14:21 +00003612 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613 return NULL;
3614}
3615
/* Find the offset of the first multibyte sequence in str (len bytes) that
   mbrtowc() rejects as invalid or incomplete.

   Returns that byte offset.  Returns 0 when no such sequence is found, when
   a null character is decoded first, or when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    size_t offset = 0;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (offset < len) {
        size_t consumed = mbrtowc(&wc, str + offset, len - offset, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return offset;
        }
        offset += consumed;
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3646
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003647PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003648PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003649 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003650{
3651 wchar_t smallbuf[256];
3652 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3653 wchar_t *wstr;
3654 size_t wlen, wlen2;
3655 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003656 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003657 size_t error_pos;
3658 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003659 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3660 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003661
3662 if (locale_error_handler(errors, &surrogateescape) < 0)
3663 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003664
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003665 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3666 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003667 return NULL;
3668 }
3669
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003670 if (surrogateescape) {
3671 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003672 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003673 if (wstr == NULL) {
3674 if (wlen == (size_t)-1)
3675 PyErr_NoMemory();
3676 else
3677 PyErr_SetFromErrno(PyExc_OSError);
3678 return NULL;
3679 }
3680
3681 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003682 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003683 }
3684 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003685 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003686#ifndef HAVE_BROKEN_MBSTOWCS
3687 wlen = mbstowcs(NULL, str, 0);
3688#else
3689 wlen = len;
3690#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003691 if (wlen == (size_t)-1)
3692 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003693 if (wlen+1 <= smallbuf_len) {
3694 wstr = smallbuf;
3695 }
3696 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003697 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003698 if (!wstr)
3699 return PyErr_NoMemory();
3700 }
3701
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003702 wlen2 = mbstowcs(wstr, str, wlen+1);
3703 if (wlen2 == (size_t)-1) {
3704 if (wstr != smallbuf)
3705 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003706 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003707 }
3708#ifdef HAVE_BROKEN_MBSTOWCS
3709 assert(wlen2 == wlen);
3710#endif
3711 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3712 if (wstr != smallbuf)
3713 PyMem_Free(wstr);
3714 }
3715 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003716
3717decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003718 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003719 errmsg = strerror(errno);
3720 assert(errmsg != NULL);
3721
3722 error_pos = mbstowcs_errorpos(str, len);
3723 if (errmsg != NULL) {
3724 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003725 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003726 if (wstr != NULL) {
3727 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003728 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003729 }
Victor Stinner2f197072011-12-17 07:08:30 +01003730 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003731 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003732 reason = PyUnicode_FromString(
3733 "mbstowcs() encountered an invalid multibyte sequence");
3734 if (reason == NULL)
3735 return NULL;
3736
3737 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3738 "locale", str, len,
3739 (Py_ssize_t)error_pos,
3740 (Py_ssize_t)(error_pos+1),
3741 reason);
3742 Py_DECREF(reason);
3743 if (exc != NULL) {
3744 PyCodec_StrictErrors(exc);
3745 Py_XDECREF(exc);
3746 }
3747 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003748}
3749
3750PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003751PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003752{
3753 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003754 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003755}
3756
3757
3758PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003759PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003760 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003761 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3762}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003763
Christian Heimes5894ba72007-11-04 11:43:14 +00003764PyObject*
3765PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3766{
Victor Stinner99b95382011-07-04 14:23:54 +02003767#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003768 return PyUnicode_DecodeMBCS(s, size, NULL);
3769#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003770 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003771#else
Victor Stinner793b5312011-04-27 00:24:21 +02003772 PyInterpreterState *interp = PyThreadState_GET()->interp;
3773 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3774 cannot use it to encode and decode filenames before it is loaded. Load
3775 the Python codec requires to encode at least its own filename. Use the C
3776 version of the locale codec until the codec registry is initialized and
3777 the Python codec is loaded.
3778
3779 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3780 cannot only rely on it: check also interp->fscodec_initialized for
3781 subinterpreters. */
3782 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003783 return PyUnicode_Decode(s, size,
3784 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003785 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003786 }
3787 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003788 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003789 }
Victor Stinnerad158722010-10-27 00:25:46 +00003790#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003791}
3792
Martin v. Löwis011e8422009-05-05 04:43:17 +00003793
3794int
3795PyUnicode_FSConverter(PyObject* arg, void* addr)
3796{
3797 PyObject *output = NULL;
3798 Py_ssize_t size;
3799 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003800 if (arg == NULL) {
3801 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003802 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003803 return 1;
3804 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003805 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003806 output = arg;
3807 Py_INCREF(output);
3808 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003809 else if (PyUnicode_Check(arg)) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003810 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003811 if (!output)
3812 return 0;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003813 assert(PyBytes_Check(output));
3814 }
3815 else {
3816 PyErr_Format(PyExc_TypeError,
3817 "must be str or bytes, not %.100s",
3818 Py_TYPE(arg)->tp_name);
3819 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003820 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003821 size = PyBytes_GET_SIZE(output);
3822 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003823 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003824 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003825 Py_DECREF(output);
3826 return 0;
3827 }
3828 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003829 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003830}
3831
3832
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003833int
3834PyUnicode_FSDecoder(PyObject* arg, void* addr)
3835{
3836 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003837 if (arg == NULL) {
3838 Py_DECREF(*(PyObject**)addr);
3839 return 1;
3840 }
3841 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003842 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003844 output = arg;
3845 Py_INCREF(output);
3846 }
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003847 else if (PyBytes_Check(arg) || PyObject_CheckBuffer(arg)) {
3848 if (!PyBytes_Check(arg) &&
3849 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3850 "path should be string or bytes, not %.200s",
3851 Py_TYPE(arg)->tp_name)) {
3852 return 0;
3853 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003854 arg = PyBytes_FromObject(arg);
3855 if (!arg)
3856 return 0;
3857 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3858 PyBytes_GET_SIZE(arg));
3859 Py_DECREF(arg);
3860 if (!output)
3861 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003862 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003863 else {
3864 PyErr_Format(PyExc_TypeError,
3865 "path should be string or bytes, not %.200s",
3866 Py_TYPE(arg)->tp_name);
3867 return 0;
3868 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003869 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003870 Py_DECREF(output);
3871 return 0;
3872 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003874 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003875 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003876 Py_DECREF(output);
3877 return 0;
3878 }
3879 *(PyObject**)addr = output;
3880 return Py_CLEANUP_SUPPORTED;
3881}
3882
3883
Martin v. Löwis5b222132007-06-10 09:51:05 +00003884char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003886{
Christian Heimesf3863112007-11-22 07:46:41 +00003887 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003888
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003889 if (!PyUnicode_Check(unicode)) {
3890 PyErr_BadArgument();
3891 return NULL;
3892 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003893 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003894 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003896 if (PyUnicode_UTF8(unicode) == NULL) {
3897 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003898 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899 if (bytes == NULL)
3900 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003901 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3902 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003903 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003904 Py_DECREF(bytes);
3905 return NULL;
3906 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003907 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3908 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3909 PyBytes_AS_STRING(bytes),
3910 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911 Py_DECREF(bytes);
3912 }
3913
3914 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003915 *psize = PyUnicode_UTF8_LENGTH(unicode);
3916 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003917}
3918
3919char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003921{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3923}
3924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003925Py_UNICODE *
3926PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003928 const unsigned char *one_byte;
3929#if SIZEOF_WCHAR_T == 4
3930 const Py_UCS2 *two_bytes;
3931#else
3932 const Py_UCS4 *four_bytes;
3933 const Py_UCS4 *ucs4_end;
3934 Py_ssize_t num_surrogates;
3935#endif
3936 wchar_t *w;
3937 wchar_t *wchar_end;
3938
3939 if (!PyUnicode_Check(unicode)) {
3940 PyErr_BadArgument();
3941 return NULL;
3942 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003943 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003945 assert(_PyUnicode_KIND(unicode) != 0);
3946 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003948 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003950 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3951 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 num_surrogates = 0;
3953
3954 for (; four_bytes < ucs4_end; ++four_bytes) {
3955 if (*four_bytes > 0xFFFF)
3956 ++num_surrogates;
3957 }
3958
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003959 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3960 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3961 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962 PyErr_NoMemory();
3963 return NULL;
3964 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003965 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003966
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003967 w = _PyUnicode_WSTR(unicode);
3968 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3969 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003970 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3971 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003972 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003974 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3975 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976 }
3977 else
3978 *w = *four_bytes;
3979
3980 if (w > wchar_end) {
3981 assert(0 && "Miscalculated string end");
3982 }
3983 }
3984 *w = 0;
3985#else
3986 /* sizeof(wchar_t) == 4 */
3987 Py_FatalError("Impossible unicode object state, wstr and str "
3988 "should share memory already.");
3989 return NULL;
3990#endif
3991 }
3992 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003993 if ((size_t)_PyUnicode_LENGTH(unicode) >
3994 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3995 PyErr_NoMemory();
3996 return NULL;
3997 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003998 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3999 (_PyUnicode_LENGTH(unicode) + 1));
4000 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 PyErr_NoMemory();
4002 return NULL;
4003 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004004 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4005 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4006 w = _PyUnicode_WSTR(unicode);
4007 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004009 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4010 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 for (; w < wchar_end; ++one_byte, ++w)
4012 *w = *one_byte;
4013 /* null-terminate the wstr */
4014 *w = 0;
4015 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004016 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004018 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 for (; w < wchar_end; ++two_bytes, ++w)
4020 *w = *two_bytes;
4021 /* null-terminate the wstr */
4022 *w = 0;
4023#else
4024 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004025 PyObject_FREE(_PyUnicode_WSTR(unicode));
4026 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027 Py_FatalError("Impossible unicode object state, wstr "
4028 "and str should share memory already.");
4029 return NULL;
4030#endif
4031 }
4032 else {
4033 assert(0 && "This should never happen.");
4034 }
4035 }
4036 }
4037 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004038 *size = PyUnicode_WSTR_LENGTH(unicode);
4039 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004040}
4041
Alexander Belopolsky40018472011-02-26 01:02:56 +00004042Py_UNICODE *
4043PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004044{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046}
4047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048
Alexander Belopolsky40018472011-02-26 01:02:56 +00004049Py_ssize_t
4050PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051{
4052 if (!PyUnicode_Check(unicode)) {
4053 PyErr_BadArgument();
4054 goto onError;
4055 }
4056 return PyUnicode_GET_SIZE(unicode);
4057
Benjamin Peterson29060642009-01-31 22:14:21 +00004058 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059 return -1;
4060}
4061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062Py_ssize_t
4063PyUnicode_GetLength(PyObject *unicode)
4064{
Victor Stinner07621332012-06-16 04:53:46 +02004065 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066 PyErr_BadArgument();
4067 return -1;
4068 }
Victor Stinner07621332012-06-16 04:53:46 +02004069 if (PyUnicode_READY(unicode) == -1)
4070 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004071 return PyUnicode_GET_LENGTH(unicode);
4072}
4073
4074Py_UCS4
4075PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4076{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004077 void *data;
4078 int kind;
4079
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004080 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4081 PyErr_BadArgument();
4082 return (Py_UCS4)-1;
4083 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004084 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004085 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004086 return (Py_UCS4)-1;
4087 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004088 data = PyUnicode_DATA(unicode);
4089 kind = PyUnicode_KIND(unicode);
4090 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004091}
4092
4093int
4094PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4095{
4096 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004097 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004098 return -1;
4099 }
Victor Stinner488fa492011-12-12 00:01:39 +01004100 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004101 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004102 PyErr_SetString(PyExc_IndexError, "string index out of range");
4103 return -1;
4104 }
Victor Stinner488fa492011-12-12 00:01:39 +01004105 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004106 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004107 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4108 PyErr_SetString(PyExc_ValueError, "character out of range");
4109 return -1;
4110 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004111 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4112 index, ch);
4113 return 0;
4114}
4115
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4121
Victor Stinner554f3f02010-06-16 23:33:54 +00004122/* create or adjust a UnicodeDecodeError */
4123static void
4124make_decode_exception(PyObject **exceptionObject,
4125 const char *encoding,
4126 const char *input, Py_ssize_t length,
4127 Py_ssize_t startpos, Py_ssize_t endpos,
4128 const char *reason)
4129{
4130 if (*exceptionObject == NULL) {
4131 *exceptionObject = PyUnicodeDecodeError_Create(
4132 encoding, input, length, startpos, endpos, reason);
4133 }
4134 else {
4135 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4136 goto onError;
4137 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4138 goto onError;
4139 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4140 goto onError;
4141 }
4142 return;
4143
4144onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004145 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004146}
4147
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004148#ifdef HAVE_MBCS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149/* error handling callback helper:
4150 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004151 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 and adjust various state variables.
4153 return 0 on success, -1 on error
4154*/
4155
Alexander Belopolsky40018472011-02-26 01:02:56 +00004156static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004157unicode_decode_call_errorhandler_wchar(
4158 const char *errors, PyObject **errorHandler,
4159 const char *encoding, const char *reason,
4160 const char **input, const char **inend, Py_ssize_t *startinpos,
4161 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4162 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004164 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165
4166 PyObject *restuple = NULL;
4167 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004168 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004169 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004170 Py_ssize_t requiredsize;
4171 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004172 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004173 wchar_t *repwstr;
4174 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004176 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4177 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01004178
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 *errorHandler = PyCodec_LookupError(errors);
4181 if (*errorHandler == NULL)
4182 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 }
4184
Victor Stinner554f3f02010-06-16 23:33:54 +00004185 make_decode_exception(exceptionObject,
4186 encoding,
4187 *input, *inend - *input,
4188 *startinpos, *endinpos,
4189 reason);
4190 if (*exceptionObject == NULL)
4191 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192
4193 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4194 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004197 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 }
4200 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004202
4203 /* Copy back the bytes variables, which might have been modified by the
4204 callback */
4205 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4206 if (!inputobj)
4207 goto onError;
4208 if (!PyBytes_Check(inputobj)) {
4209 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4210 }
4211 *input = PyBytes_AS_STRING(inputobj);
4212 insize = PyBytes_GET_SIZE(inputobj);
4213 *inend = *input + insize;
4214 /* we can DECREF safely, as the exception has another reference,
4215 so the object won't go away. */
4216 Py_DECREF(inputobj);
4217
4218 if (newpos<0)
4219 newpos = insize+newpos;
4220 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004221 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004222 goto onError;
4223 }
4224
4225 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4226 if (repwstr == NULL)
4227 goto onError;
4228 /* need more space? (at least enough for what we
4229 have+the replacement+the rest of the string (starting
4230 at the new input position), so we won't have to check space
4231 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004232 requiredsize = *outpos;
4233 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4234 goto overflow;
4235 requiredsize += repwlen;
4236 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4237 goto overflow;
4238 requiredsize += insize - newpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004239 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004240 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004241 requiredsize = 2*outsize;
4242 if (unicode_resize(output, requiredsize) < 0)
4243 goto onError;
4244 }
4245 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4246 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004247 *endinpos = newpos;
4248 *inptr = *input + newpos;
4249
4250 /* we made it! */
4251 Py_XDECREF(restuple);
4252 return 0;
4253
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004254 overflow:
4255 PyErr_SetString(PyExc_OverflowError,
4256 "decoded result is too long for a Python string");
4257
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004258 onError:
4259 Py_XDECREF(restuple);
4260 return -1;
4261}
4262#endif /* HAVE_MBCS */
4263
4264static int
4265unicode_decode_call_errorhandler_writer(
4266 const char *errors, PyObject **errorHandler,
4267 const char *encoding, const char *reason,
4268 const char **input, const char **inend, Py_ssize_t *startinpos,
4269 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4270 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4271{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02004272 static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004273
4274 PyObject *restuple = NULL;
4275 PyObject *repunicode = NULL;
4276 Py_ssize_t insize;
4277 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004278 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004279 PyObject *inputobj = NULL;
4280
4281 if (*errorHandler == NULL) {
4282 *errorHandler = PyCodec_LookupError(errors);
4283 if (*errorHandler == NULL)
4284 goto onError;
4285 }
4286
4287 make_decode_exception(exceptionObject,
4288 encoding,
4289 *input, *inend - *input,
4290 *startinpos, *endinpos,
4291 reason);
4292 if (*exceptionObject == NULL)
4293 goto onError;
4294
4295 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4296 if (restuple == NULL)
4297 goto onError;
4298 if (!PyTuple_Check(restuple)) {
4299 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4300 goto onError;
4301 }
4302 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004303 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004304
4305 /* Copy back the bytes variables, which might have been modified by the
4306 callback */
4307 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4308 if (!inputobj)
4309 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004310 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004311 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004312 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004313 *input = PyBytes_AS_STRING(inputobj);
4314 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004315 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004316 /* we can DECREF safely, as the exception has another reference,
4317 so the object won't go away. */
4318 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004321 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004322 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004323 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004324 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004325 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004326
Victor Stinner8f674cc2013-04-17 23:02:17 +02004327 if (PyUnicode_READY(repunicode) < 0)
4328 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004329 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004330 if (replen > 1) {
4331 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004332 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004333 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4334 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4335 goto onError;
4336 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004337 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004338 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004339
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004340 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004341 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004342
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004343 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 Py_XDECREF(restuple);
4345 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004346
Benjamin Peterson29060642009-01-31 22:14:21 +00004347 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004349 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004350}
4351
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004352/* --- UTF-7 Codec -------------------------------------------------------- */
4353
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354/* See RFC2152 for details. We encode conservatively and decode liberally. */
4355
/* Base-64 helper macros for the UTF-7 codec.  Each one may evaluate its
   argument several times, so only pass side-effect-free expressions. */

/* IS_BASE64(c): non-zero when c belongs to the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/'). */
#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64(c): the 6-bit value (0..63) of base-64 character c.
   The result is only meaningful when IS_BASE64(c) holds; anything that is
   not a letter, digit or '+' maps to 63. */
#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* TO_BASE64(n): the base-64 character for the low six bits of n. */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT(c): non-zero when byte c, met outside a shift sequence
 * in UTF-7 input, stands for itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base-64 section. */
#define DECODE_DIRECT(c)  \
    ((c) <= 127 && (c) != '+')
4386
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        character code; codes outside 1..127 are never direct
 * directO  non-zero to emit Set O characters unencoded
 * directWS non-zero to emit whitespace characters unencoded
 *
 * All three arguments are parenthesised in the expansion so that compound
 * expressions (e.g. `a || b`) can be passed safely.  `c` is evaluated more
 * than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004432
Alexander Belopolsky40018472011-02-26 01:02:56 +00004433PyObject *
4434PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004435 Py_ssize_t size,
4436 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004438 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4439}
4440
Antoine Pitrou244651a2009-05-04 18:56:13 +00004441/* The decoder. The only state we preserve is our read position,
4442 * i.e. how many characters we have consumed. So if we end in the
4443 * middle of a shift sequence we have to back off the read position
4444 * and the output to the beginning of the sequence, otherwise we lose
4445 * all the shift state (seen bits, number of bits seen, high
4446 * surrogate). */
4447
Alexander Belopolsky40018472011-02-26 01:02:56 +00004448PyObject *
4449PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004450 Py_ssize_t size,
4451 const char *errors,
4452 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004453{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004455 Py_ssize_t startinpos;
4456 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004457 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004458 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004459 const char *errmsg = "";
4460 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004461 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 unsigned int base64bits = 0;
4463 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004464 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 PyObject *errorHandler = NULL;
4466 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004467
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004468 if (size == 0) {
4469 if (consumed)
4470 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004471 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004472 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004474 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004475 _PyUnicodeWriter_Init(&writer);
4476 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004477
4478 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004479 e = s + size;
4480
4481 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004482 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004483 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004484 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004485
Antoine Pitrou244651a2009-05-04 18:56:13 +00004486 if (inShift) { /* in a base-64 section */
4487 if (IS_BASE64(ch)) { /* consume a base-64 character */
4488 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4489 base64bits += 6;
4490 s++;
4491 if (base64bits >= 16) {
4492 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004493 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004494 base64bits -= 16;
4495 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004496 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 if (surrogate) {
4498 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004499 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4500 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004501 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004502 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004503 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004504 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004505 }
4506 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004507 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004508 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004509 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 }
4511 }
Victor Stinner551ac952011-11-29 22:58:13 +01004512 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004513 /* first surrogate */
4514 surrogate = outCh;
4515 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004516 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004517 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004518 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004519 }
4520 }
4521 }
4522 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004523 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 if (base64bits > 0) { /* left-over bits */
4525 if (base64bits >= 6) {
4526 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004527 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004528 errmsg = "partial character in shift sequence";
4529 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004531 else {
4532 /* Some bits remain; they should be zero */
4533 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004534 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 errmsg = "non-zero padding bits in shift sequence";
4536 goto utf7Error;
4537 }
4538 }
4539 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004540 if (surrogate && DECODE_DIRECT(ch)) {
4541 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4542 goto onError;
4543 }
4544 surrogate = 0;
4545 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004546 /* '-' is absorbed; other terminating
4547 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004548 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004549 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550 }
4551 }
4552 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004554 s++; /* consume '+' */
4555 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004557 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004558 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 }
4560 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004562 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004563 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004565 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004566 }
4567 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004568 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004569 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004570 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004571 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004572 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 else {
4574 startinpos = s-starts;
4575 s++;
4576 errmsg = "unexpected special character";
4577 goto utf7Error;
4578 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004582 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 errors, &errorHandler,
4584 "utf7", errmsg,
4585 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004586 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588 }
4589
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 /* end of string */
4591
4592 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4593 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004594 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004595 if (surrogate ||
4596 (base64bits >= 6) ||
4597 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004598 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004599 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 errors, &errorHandler,
4601 "utf7", "unterminated shift sequence",
4602 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004603 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004604 goto onError;
4605 if (s < e)
4606 goto restart;
4607 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004608 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609
4610 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004611 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004613 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004614 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004615 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004616 writer.kind, writer.data, shiftOutStart);
4617 Py_XDECREF(errorHandler);
4618 Py_XDECREF(exc);
4619 _PyUnicodeWriter_Dealloc(&writer);
4620 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004621 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004622 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004623 }
4624 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004625 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004626 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004627 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004628
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 Py_XDECREF(errorHandler);
4630 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004631 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632
Benjamin Peterson29060642009-01-31 22:14:21 +00004633 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004634 Py_XDECREF(errorHandler);
4635 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004636 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637 return NULL;
4638}
4639
4640
Alexander Belopolsky40018472011-02-26 01:02:56 +00004641PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004642_PyUnicode_EncodeUTF7(PyObject *str,
4643 int base64SetO,
4644 int base64WhiteSpace,
4645 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004646{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004647 int kind;
4648 void *data;
4649 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004650 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004651 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004652 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004653 unsigned int base64bits = 0;
4654 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004655 char * out;
4656 char * start;
4657
Benjamin Petersonbac79492012-01-14 13:34:47 -05004658 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004659 return NULL;
4660 kind = PyUnicode_KIND(str);
4661 data = PyUnicode_DATA(str);
4662 len = PyUnicode_GET_LENGTH(str);
4663
4664 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004665 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004666
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004667 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004668 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004669 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004670 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004671 if (v == NULL)
4672 return NULL;
4673
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004674 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004675 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004676 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004677
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 if (inShift) {
4679 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4680 /* shifting out */
4681 if (base64bits) { /* output remaining bits */
4682 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4683 base64buffer = 0;
4684 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004685 }
4686 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 /* Characters not in the BASE64 set implicitly unshift the sequence
4688 so no '-' is required, except if the character is itself a '-' */
4689 if (IS_BASE64(ch) || ch == '-') {
4690 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004691 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004692 *out++ = (char) ch;
4693 }
4694 else {
4695 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004696 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004697 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004698 else { /* not in a shift sequence */
4699 if (ch == '+') {
4700 *out++ = '+';
4701 *out++ = '-';
4702 }
4703 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4704 *out++ = (char) ch;
4705 }
4706 else {
4707 *out++ = '+';
4708 inShift = 1;
4709 goto encode_char;
4710 }
4711 }
4712 continue;
4713encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004714 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004715 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004716
Antoine Pitrou244651a2009-05-04 18:56:13 +00004717 /* code first surrogate */
4718 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004719 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004720 while (base64bits >= 6) {
4721 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4722 base64bits -= 6;
4723 }
4724 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004725 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004727 base64bits += 16;
4728 base64buffer = (base64buffer << 16) | ch;
4729 while (base64bits >= 6) {
4730 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4731 base64bits -= 6;
4732 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004733 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004734 if (base64bits)
4735 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4736 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004737 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004738 if (_PyBytes_Resize(&v, out - start) < 0)
4739 return NULL;
4740 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004741}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004742PyObject *
4743PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4744 Py_ssize_t size,
4745 int base64SetO,
4746 int base64WhiteSpace,
4747 const char *errors)
4748{
4749 PyObject *result;
4750 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4751 if (tmp == NULL)
4752 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004753 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004754 base64WhiteSpace, errors);
4755 Py_DECREF(tmp);
4756 return result;
4757}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004758
Antoine Pitrou244651a2009-05-04 18:56:13 +00004759#undef IS_BASE64
4760#undef FROM_BASE64
4761#undef TO_BASE64
4762#undef DECODE_DIRECT
4763#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004764
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765/* --- UTF-8 Codec -------------------------------------------------------- */
4766
Alexander Belopolsky40018472011-02-26 01:02:56 +00004767PyObject *
4768PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004769 Py_ssize_t size,
4770 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771{
Walter Dörwald69652032004-09-07 20:24:22 +00004772 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4773}
4774
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004775#include "stringlib/asciilib.h"
4776#include "stringlib/codecs.h"
4777#include "stringlib/undef.h"
4778
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004779#include "stringlib/ucs1lib.h"
4780#include "stringlib/codecs.h"
4781#include "stringlib/undef.h"
4782
4783#include "stringlib/ucs2lib.h"
4784#include "stringlib/codecs.h"
4785#include "stringlib/undef.h"
4786
4787#include "stringlib/ucs4lib.h"
4788#include "stringlib/codecs.h"
4789#include "stringlib/undef.h"
4790
Antoine Pitrouab868312009-01-10 15:40:25 +00004791/* Mask to quickly check whether a C 'long' contains a
4792 non-ASCII, UTF8-encoded char. */
4793#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004794# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004795#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004796# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004797#else
4798# error C 'long' size should be either 4 or 8!
4799#endif
4800
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004801static Py_ssize_t
4802ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004803{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004805 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004806
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004807 /*
4808 * Issue #17237: m68k is a bit different from most architectures in
4809 * that objects do not use "natural alignment" - for example, int and
4810 * long are only aligned at 2-byte boundaries. Therefore the assert()
4811 * won't work; also, tests have shown that skipping the "optimised
4812 * version" will even speed up m68k.
4813 */
4814#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004815#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004816 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4817 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004818 /* Fast path, see in STRINGLIB(utf8_decode) for
4819 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004820 /* Help allocation */
4821 const char *_p = p;
4822 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004823 while (_p < aligned_end) {
4824 unsigned long value = *(const unsigned long *) _p;
4825 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004826 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004827 *((unsigned long *)q) = value;
4828 _p += SIZEOF_LONG;
4829 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004830 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004831 p = _p;
4832 while (p < end) {
4833 if ((unsigned char)*p & 0x80)
4834 break;
4835 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004837 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004839#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004840#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004841 while (p < end) {
4842 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4843 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004844 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004845 /* Help allocation */
4846 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004847 while (_p < aligned_end) {
4848 unsigned long value = *(unsigned long *) _p;
4849 if (value & ASCII_CHAR_MASK)
4850 break;
4851 _p += SIZEOF_LONG;
4852 }
4853 p = _p;
4854 if (_p == end)
4855 break;
4856 }
4857 if ((unsigned char)*p & 0x80)
4858 break;
4859 ++p;
4860 }
4861 memcpy(dest, start, p - start);
4862 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863}
Antoine Pitrouab868312009-01-10 15:40:25 +00004864
Victor Stinner785938e2011-12-11 20:09:03 +01004865PyObject *
4866PyUnicode_DecodeUTF8Stateful(const char *s,
4867 Py_ssize_t size,
4868 const char *errors,
4869 Py_ssize_t *consumed)
4870{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004871 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004872 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004873 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004874
4875 Py_ssize_t startinpos;
4876 Py_ssize_t endinpos;
4877 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004878 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004879 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004880 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004881
4882 if (size == 0) {
4883 if (consumed)
4884 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004885 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004886 }
4887
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4889 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004890 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891 *consumed = 1;
4892 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004893 }
4894
Victor Stinner8f674cc2013-04-17 23:02:17 +02004895 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004896 writer.min_length = size;
4897 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004898 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004899
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004900 writer.pos = ascii_decode(s, end, writer.data);
4901 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902 while (s < end) {
4903 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004904 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004905
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004906 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004907 if (PyUnicode_IS_ASCII(writer.buffer))
4908 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004909 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004910 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004912 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004913 } else {
4914 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004915 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004916 }
4917
4918 switch (ch) {
4919 case 0:
4920 if (s == end || consumed)
4921 goto End;
4922 errmsg = "unexpected end of data";
4923 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004924 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004925 break;
4926 case 1:
4927 errmsg = "invalid start byte";
4928 startinpos = s - starts;
4929 endinpos = startinpos + 1;
4930 break;
4931 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004932 case 3:
4933 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004934 errmsg = "invalid continuation byte";
4935 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004936 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937 break;
4938 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004939 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940 goto onError;
4941 continue;
4942 }
4943
Victor Stinner1d65d912015-10-05 13:43:50 +02004944 if (error_handler == _Py_ERROR_UNKNOWN)
4945 error_handler = get_error_handler(errors);
4946
4947 switch (error_handler) {
4948 case _Py_ERROR_IGNORE:
4949 s += (endinpos - startinpos);
4950 break;
4951
4952 case _Py_ERROR_REPLACE:
4953 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4954 goto onError;
4955 s += (endinpos - startinpos);
4956 break;
4957
4958 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004959 {
4960 Py_ssize_t i;
4961
Victor Stinner1d65d912015-10-05 13:43:50 +02004962 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4963 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004964 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004965 ch = (Py_UCS4)(unsigned char)(starts[i]);
4966 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4967 ch + 0xdc00);
4968 writer.pos++;
4969 }
4970 s += (endinpos - startinpos);
4971 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004972 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004973
4974 default:
4975 if (unicode_decode_call_errorhandler_writer(
4976 errors, &error_handler_obj,
4977 "utf-8", errmsg,
4978 &starts, &end, &startinpos, &endinpos, &exc, &s,
4979 &writer))
4980 goto onError;
4981 }
Victor Stinner785938e2011-12-11 20:09:03 +01004982 }
4983
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004985 if (consumed)
4986 *consumed = s - starts;
4987
Victor Stinner1d65d912015-10-05 13:43:50 +02004988 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004990 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991
4992onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02004993 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004994 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004995 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004996 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004997}
4998
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004999#ifdef __APPLE__
5000
5001/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005002 used to decode the command line arguments on Mac OS X.
5003
5004 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005005 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005006
5007wchar_t*
5008_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5009{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005010 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011 wchar_t *unicode;
5012 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005013
5014 /* Note: size will always be longer than the resulting Unicode
5015 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01005016 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005017 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005018 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005019 if (!unicode)
5020 return NULL;
5021
5022 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005023 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005025 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005026 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005027#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005028 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005029#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005030 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005031#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005032 if (ch > 0xFF) {
5033#if SIZEOF_WCHAR_T == 4
5034 assert(0);
5035#else
5036 assert(Py_UNICODE_IS_SURROGATE(ch));
5037 /* compute and append the two surrogates: */
5038 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5039 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5040#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005041 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005042 else {
5043 if (!ch && s == e)
5044 break;
5045 /* surrogateescape */
5046 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5047 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005048 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005049 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005050 return unicode;
5051}
5052
5053#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005055/* Primary internal function which creates utf8 encoded bytes objects.
5056
5057 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005058 and allocate exactly as much space needed at the end. Else allocate the
5059 maximum possible needed (4 result bytes per Unicode character), and return
5060 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005061*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005062PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005063_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064{
Victor Stinner6099a032011-12-18 14:22:26 +01005065 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005066 void *data;
5067 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005069 if (!PyUnicode_Check(unicode)) {
5070 PyErr_BadArgument();
5071 return NULL;
5072 }
5073
5074 if (PyUnicode_READY(unicode) == -1)
5075 return NULL;
5076
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005077 if (PyUnicode_UTF8(unicode))
5078 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5079 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005080
5081 kind = PyUnicode_KIND(unicode);
5082 data = PyUnicode_DATA(unicode);
5083 size = PyUnicode_GET_LENGTH(unicode);
5084
Benjamin Petersonead6b532011-12-20 17:23:42 -06005085 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005086 default:
5087 assert(0);
5088 case PyUnicode_1BYTE_KIND:
5089 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5090 assert(!PyUnicode_IS_ASCII(unicode));
5091 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5092 case PyUnicode_2BYTE_KIND:
5093 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5094 case PyUnicode_4BYTE_KIND:
5095 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097}
5098
Alexander Belopolsky40018472011-02-26 01:02:56 +00005099PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005100PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5101 Py_ssize_t size,
5102 const char *errors)
5103{
5104 PyObject *v, *unicode;
5105
5106 unicode = PyUnicode_FromUnicode(s, size);
5107 if (unicode == NULL)
5108 return NULL;
5109 v = _PyUnicode_AsUTF8String(unicode, errors);
5110 Py_DECREF(unicode);
5111 return v;
5112}
5113
5114PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005115PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005117 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118}
5119
Walter Dörwald41980ca2007-08-16 21:55:45 +00005120/* --- UTF-32 Codec ------------------------------------------------------- */
5121
5122PyObject *
5123PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 Py_ssize_t size,
5125 const char *errors,
5126 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005127{
5128 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5129}
5130
5131PyObject *
5132PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005133 Py_ssize_t size,
5134 const char *errors,
5135 int *byteorder,
5136 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005137{
5138 const char *starts = s;
5139 Py_ssize_t startinpos;
5140 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005141 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005142 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005143 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005144 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005145 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005146 PyObject *errorHandler = NULL;
5147 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005148
Walter Dörwald41980ca2007-08-16 21:55:45 +00005149 q = (unsigned char *)s;
5150 e = q + size;
5151
5152 if (byteorder)
5153 bo = *byteorder;
5154
5155 /* Check for BOM marks (U+FEFF) in the input and adjust current
5156 byte order setting accordingly. In native mode, the leading BOM
5157 mark is skipped, in all other modes, it is copied to the output
5158 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005159 if (bo == 0 && size >= 4) {
5160 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5161 if (bom == 0x0000FEFF) {
5162 bo = -1;
5163 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005164 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005165 else if (bom == 0xFFFE0000) {
5166 bo = 1;
5167 q += 4;
5168 }
5169 if (byteorder)
5170 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005171 }
5172
Victor Stinnere64322e2012-10-30 23:12:47 +01005173 if (q == e) {
5174 if (consumed)
5175 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005176 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005177 }
5178
Victor Stinnere64322e2012-10-30 23:12:47 +01005179#ifdef WORDS_BIGENDIAN
5180 le = bo < 0;
5181#else
5182 le = bo <= 0;
5183#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005184 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005185
Victor Stinner8f674cc2013-04-17 23:02:17 +02005186 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005187 writer.min_length = (e - q + 3) / 4;
5188 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005189 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005190
Victor Stinnere64322e2012-10-30 23:12:47 +01005191 while (1) {
5192 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005193 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005194
Victor Stinnere64322e2012-10-30 23:12:47 +01005195 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005196 enum PyUnicode_Kind kind = writer.kind;
5197 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005198 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005199 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005200 if (le) {
5201 do {
5202 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5203 if (ch > maxch)
5204 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005205 if (kind != PyUnicode_1BYTE_KIND &&
5206 Py_UNICODE_IS_SURROGATE(ch))
5207 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005208 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005209 q += 4;
5210 } while (q <= last);
5211 }
5212 else {
5213 do {
5214 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5215 if (ch > maxch)
5216 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005217 if (kind != PyUnicode_1BYTE_KIND &&
5218 Py_UNICODE_IS_SURROGATE(ch))
5219 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005220 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005221 q += 4;
5222 } while (q <= last);
5223 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005224 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005225 }
5226
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005227 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005228 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005229 startinpos = ((const char *)q) - starts;
5230 endinpos = startinpos + 4;
5231 }
5232 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005233 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005234 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005235 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005237 startinpos = ((const char *)q) - starts;
5238 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005240 else {
5241 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005242 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005243 goto onError;
5244 q += 4;
5245 continue;
5246 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005247 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005248 startinpos = ((const char *)q) - starts;
5249 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005251
5252 /* The remaining input chars are ignored if the callback
5253 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005254 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005255 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005256 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005258 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005259 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005260 }
5261
Walter Dörwald41980ca2007-08-16 21:55:45 +00005262 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005264
Walter Dörwald41980ca2007-08-16 21:55:45 +00005265 Py_XDECREF(errorHandler);
5266 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005267 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005268
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005270 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005271 Py_XDECREF(errorHandler);
5272 Py_XDECREF(exc);
5273 return NULL;
5274}
5275
5276PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005277_PyUnicode_EncodeUTF32(PyObject *str,
5278 const char *errors,
5279 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005280{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005281 enum PyUnicode_Kind kind;
5282 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005283 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005284 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005285 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005286#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005287 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005288#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005289 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005290#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005291 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005292 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005293 PyObject *errorHandler = NULL;
5294 PyObject *exc = NULL;
5295 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005296
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005297 if (!PyUnicode_Check(str)) {
5298 PyErr_BadArgument();
5299 return NULL;
5300 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005301 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005302 return NULL;
5303 kind = PyUnicode_KIND(str);
5304 data = PyUnicode_DATA(str);
5305 len = PyUnicode_GET_LENGTH(str);
5306
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005307 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005308 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005309 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005310 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005311 if (v == NULL)
5312 return NULL;
5313
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005314 /* output buffer is 4-bytes aligned */
5315 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5316 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005317 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005318 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005319 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005320 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005321
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005322 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005323 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005324 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005325 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005326 else
5327 encoding = "utf-32";
5328
5329 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005330 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5331 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005332 }
5333
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005334 pos = 0;
5335 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005336 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005337
5338 if (kind == PyUnicode_2BYTE_KIND) {
5339 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5340 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005341 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005342 else {
5343 assert(kind == PyUnicode_4BYTE_KIND);
5344 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5345 &out, native_ordering);
5346 }
5347 if (pos == len)
5348 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005349
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005350 rep = unicode_encode_call_errorhandler(
5351 errors, &errorHandler,
5352 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005353 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005354 if (!rep)
5355 goto error;
5356
5357 if (PyBytes_Check(rep)) {
5358 repsize = PyBytes_GET_SIZE(rep);
5359 if (repsize & 3) {
5360 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005361 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005362 "surrogates not allowed");
5363 goto error;
5364 }
5365 moreunits = repsize / 4;
5366 }
5367 else {
5368 assert(PyUnicode_Check(rep));
5369 if (PyUnicode_READY(rep) < 0)
5370 goto error;
5371 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5372 if (!PyUnicode_IS_ASCII(rep)) {
5373 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005374 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005375 "surrogates not allowed");
5376 goto error;
5377 }
5378 }
5379
5380 /* four bytes are reserved for each surrogate */
5381 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005382 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005383 Py_ssize_t morebytes = 4 * (moreunits - 1);
5384 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5385 /* integer overflow */
5386 PyErr_NoMemory();
5387 goto error;
5388 }
5389 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5390 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005391 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005392 }
5393
5394 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005395 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5396 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005397 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005398 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005399 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5400 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005401 }
5402
5403 Py_CLEAR(rep);
5404 }
5405
5406 /* Cut back to size actually needed. This is necessary for, for example,
5407 encoding of a string containing isolated surrogates and the 'ignore'
5408 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005409 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005410 if (nsize != PyBytes_GET_SIZE(v))
5411 _PyBytes_Resize(&v, nsize);
5412 Py_XDECREF(errorHandler);
5413 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005414 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005415 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005416 error:
5417 Py_XDECREF(rep);
5418 Py_XDECREF(errorHandler);
5419 Py_XDECREF(exc);
5420 Py_XDECREF(v);
5421 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005422}
5423
Alexander Belopolsky40018472011-02-26 01:02:56 +00005424PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005425PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5426 Py_ssize_t size,
5427 const char *errors,
5428 int byteorder)
5429{
5430 PyObject *result;
5431 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5432 if (tmp == NULL)
5433 return NULL;
5434 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5435 Py_DECREF(tmp);
5436 return result;
5437}
5438
5439PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005440PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005441{
Victor Stinnerb960b342011-11-20 19:12:52 +01005442 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005443}
5444
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445/* --- UTF-16 Codec ------------------------------------------------------- */
5446
Tim Peters772747b2001-08-09 22:21:55 +00005447PyObject *
5448PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 Py_ssize_t size,
5450 const char *errors,
5451 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452{
Walter Dörwald69652032004-09-07 20:24:22 +00005453 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5454}
5455
5456PyObject *
5457PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005458 Py_ssize_t size,
5459 const char *errors,
5460 int *byteorder,
5461 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005462{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005463 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005464 Py_ssize_t startinpos;
5465 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005466 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005467 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005468 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005469 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005470 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005471 PyObject *errorHandler = NULL;
5472 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005473 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474
Tim Peters772747b2001-08-09 22:21:55 +00005475 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005476 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477
5478 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005479 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005481 /* Check for BOM marks (U+FEFF) in the input and adjust current
5482 byte order setting accordingly. In native mode, the leading BOM
5483 mark is skipped, in all other modes, it is copied to the output
5484 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005485 if (bo == 0 && size >= 2) {
5486 const Py_UCS4 bom = (q[1] << 8) | q[0];
5487 if (bom == 0xFEFF) {
5488 q += 2;
5489 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005491 else if (bom == 0xFFFE) {
5492 q += 2;
5493 bo = 1;
5494 }
5495 if (byteorder)
5496 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498
Antoine Pitrou63065d72012-05-15 23:48:04 +02005499 if (q == e) {
5500 if (consumed)
5501 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005502 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005503 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005504
Christian Heimes743e0cd2012-10-17 23:52:17 +02005505#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005506 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005507 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005508#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005509 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005510 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005511#endif
Tim Peters772747b2001-08-09 22:21:55 +00005512
Antoine Pitrou63065d72012-05-15 23:48:04 +02005513 /* Note: size will always be longer than the resulting Unicode
5514 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005515 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005516 writer.min_length = (e - q + 1) / 2;
5517 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005518 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005519
Antoine Pitrou63065d72012-05-15 23:48:04 +02005520 while (1) {
5521 Py_UCS4 ch = 0;
5522 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005523 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005524 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005525 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005526 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005527 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005528 native_ordering);
5529 else
5530 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005531 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005532 native_ordering);
5533 } else if (kind == PyUnicode_2BYTE_KIND) {
5534 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005535 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005536 native_ordering);
5537 } else {
5538 assert(kind == PyUnicode_4BYTE_KIND);
5539 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005540 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005541 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005542 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005543 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005544
Antoine Pitrou63065d72012-05-15 23:48:04 +02005545 switch (ch)
5546 {
5547 case 0:
5548 /* remaining byte at the end? (size should be even) */
5549 if (q == e || consumed)
5550 goto End;
5551 errmsg = "truncated data";
5552 startinpos = ((const char *)q) - starts;
5553 endinpos = ((const char *)e) - starts;
5554 break;
5555 /* The remaining input chars are ignored if the callback
5556 chooses to skip the input */
5557 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005558 q -= 2;
5559 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005560 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005561 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005562 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005563 endinpos = ((const char *)e) - starts;
5564 break;
5565 case 2:
5566 errmsg = "illegal encoding";
5567 startinpos = ((const char *)q) - 2 - starts;
5568 endinpos = startinpos + 2;
5569 break;
5570 case 3:
5571 errmsg = "illegal UTF-16 surrogate";
5572 startinpos = ((const char *)q) - 4 - starts;
5573 endinpos = startinpos + 2;
5574 break;
5575 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005576 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005577 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 continue;
5579 }
5580
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005581 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005582 errors,
5583 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005584 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005585 &starts,
5586 (const char **)&e,
5587 &startinpos,
5588 &endinpos,
5589 &exc,
5590 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005591 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 }
5594
Antoine Pitrou63065d72012-05-15 23:48:04 +02005595End:
Walter Dörwald69652032004-09-07 20:24:22 +00005596 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005599 Py_XDECREF(errorHandler);
5600 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005601 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602
Benjamin Peterson29060642009-01-31 22:14:21 +00005603 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005604 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005605 Py_XDECREF(errorHandler);
5606 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 return NULL;
5608}
5609
Tim Peters772747b2001-08-09 22:21:55 +00005610PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005611_PyUnicode_EncodeUTF16(PyObject *str,
5612 const char *errors,
5613 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005615 enum PyUnicode_Kind kind;
5616 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005617 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005618 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005619 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005620 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005621#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005622 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005623#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005624 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005625#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005626 const char *encoding;
5627 Py_ssize_t nsize, pos;
5628 PyObject *errorHandler = NULL;
5629 PyObject *exc = NULL;
5630 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005631
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005632 if (!PyUnicode_Check(str)) {
5633 PyErr_BadArgument();
5634 return NULL;
5635 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005636 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005637 return NULL;
5638 kind = PyUnicode_KIND(str);
5639 data = PyUnicode_DATA(str);
5640 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005641
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005642 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005643 if (kind == PyUnicode_4BYTE_KIND) {
5644 const Py_UCS4 *in = (const Py_UCS4 *)data;
5645 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005646 while (in < end) {
5647 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005648 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005649 }
5650 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005651 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005652 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005653 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005654 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005655 nsize = len + pairs + (byteorder == 0);
5656 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005657 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005659 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005661 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005662 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005663 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005664 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005665 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005666 }
5667 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005668 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005669 }
Tim Peters772747b2001-08-09 22:21:55 +00005670
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005671 if (kind == PyUnicode_1BYTE_KIND) {
5672 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5673 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005674 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005675
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005676 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005677 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005678 }
5679 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005680 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005681 }
5682 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005683 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005684 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005685
5686 pos = 0;
5687 while (pos < len) {
5688 Py_ssize_t repsize, moreunits;
5689
5690 if (kind == PyUnicode_2BYTE_KIND) {
5691 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5692 &out, native_ordering);
5693 }
5694 else {
5695 assert(kind == PyUnicode_4BYTE_KIND);
5696 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5697 &out, native_ordering);
5698 }
5699 if (pos == len)
5700 break;
5701
5702 rep = unicode_encode_call_errorhandler(
5703 errors, &errorHandler,
5704 encoding, "surrogates not allowed",
5705 str, &exc, pos, pos + 1, &pos);
5706 if (!rep)
5707 goto error;
5708
5709 if (PyBytes_Check(rep)) {
5710 repsize = PyBytes_GET_SIZE(rep);
5711 if (repsize & 1) {
5712 raise_encode_exception(&exc, encoding,
5713 str, pos - 1, pos,
5714 "surrogates not allowed");
5715 goto error;
5716 }
5717 moreunits = repsize / 2;
5718 }
5719 else {
5720 assert(PyUnicode_Check(rep));
5721 if (PyUnicode_READY(rep) < 0)
5722 goto error;
5723 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5724 if (!PyUnicode_IS_ASCII(rep)) {
5725 raise_encode_exception(&exc, encoding,
5726 str, pos - 1, pos,
5727 "surrogates not allowed");
5728 goto error;
5729 }
5730 }
5731
5732 /* two bytes are reserved for each surrogate */
5733 if (moreunits > 1) {
5734 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5735 Py_ssize_t morebytes = 2 * (moreunits - 1);
5736 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5737 /* integer overflow */
5738 PyErr_NoMemory();
5739 goto error;
5740 }
5741 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5742 goto error;
5743 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5744 }
5745
5746 if (PyBytes_Check(rep)) {
5747 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5748 out += moreunits;
5749 } else /* rep is unicode */ {
5750 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5751 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5752 &out, native_ordering);
5753 }
5754
5755 Py_CLEAR(rep);
5756 }
5757
5758 /* Cut back to size actually needed. This is necessary for, for example,
5759 encoding of a string containing isolated surrogates and the 'ignore' handler
5760 is used. */
5761 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5762 if (nsize != PyBytes_GET_SIZE(v))
5763 _PyBytes_Resize(&v, nsize);
5764 Py_XDECREF(errorHandler);
5765 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005766 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005767 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005768 error:
5769 Py_XDECREF(rep);
5770 Py_XDECREF(errorHandler);
5771 Py_XDECREF(exc);
5772 Py_XDECREF(v);
5773 return NULL;
5774#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775}
5776
Alexander Belopolsky40018472011-02-26 01:02:56 +00005777PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005778PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5779 Py_ssize_t size,
5780 const char *errors,
5781 int byteorder)
5782{
5783 PyObject *result;
5784 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5785 if (tmp == NULL)
5786 return NULL;
5787 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5788 Py_DECREF(tmp);
5789 return result;
5790}
5791
5792PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005793PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005795 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796}
5797
5798/* --- Unicode Escape Codec ----------------------------------------------- */
5799
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether the
   escaped input decodes to a pure-ASCII string and, if so, how long it is.

   Returns the number of characters the decoded string will have, or -1 if
     - size is negative,
     - the input contains a byte above 127,
     - a backslash is the last byte of the input, or
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee an ASCII result).
   Backslash-newline contributes zero characters; an unrecognised escape
   contributes two (the backslash and the following character). */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before doing any pointer arithmetic with it. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5854
Fredrik Lundh06d12682001-01-24 07:59:11 +00005855static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005856
Alexander Belopolsky40018472011-02-26 01:02:56 +00005857PyObject *
5858PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005859 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005860 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863 Py_ssize_t startinpos;
5864 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005865 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005867 char* message;
5868 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005869 PyObject *errorHandler = NULL;
5870 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005871 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005872
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005873 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005874 if (len == 0)
5875 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005876
5877 /* After length_of_escaped_ascii_string() there are two alternatives,
5878 either the string is pure ASCII with named escapes like \n, etc.
5879 and we determined it's exact size (common case)
5880 or it contains \x, \u, ... escape sequences. then we create a
5881 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005882 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005883 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005884 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005885 }
5886 else {
5887 /* Escaped strings will always be longer than the resulting
5888 Unicode string, so we start with size here and then reduce the
5889 length after conversion to the true value.
5890 (but if the error callback returns a long replacement string
5891 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005892 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005893 }
5894
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005896 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005898
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 while (s < end) {
5900 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005901 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005902 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903
5904 /* Non-escape characters are interpreted as Unicode ordinals */
5905 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005906 x = (unsigned char)*s;
5907 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005908 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005909 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 continue;
5911 }
5912
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005913 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 /* \ - Escapes */
5915 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005916 c = *s++;
5917 if (s > end)
5918 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005919
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005920 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005923#define WRITECHAR(ch) \
5924 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005925 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005926 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005927 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005928
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005930 case '\\': WRITECHAR('\\'); break;
5931 case '\'': WRITECHAR('\''); break;
5932 case '\"': WRITECHAR('\"'); break;
5933 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005934 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005935 case 'f': WRITECHAR('\014'); break;
5936 case 't': WRITECHAR('\t'); break;
5937 case 'n': WRITECHAR('\n'); break;
5938 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005939 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005940 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005941 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005942 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 case '0': case '1': case '2': case '3':
5946 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005947 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005948 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005949 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005950 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005951 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005953 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 break;
5955
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 /* hex escapes */
5957 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005959 digits = 2;
5960 message = "truncated \\xXX escape";
5961 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005965 digits = 4;
5966 message = "truncated \\uXXXX escape";
5967 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005970 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005971 digits = 8;
5972 message = "truncated \\UXXXXXXXX escape";
5973 hexescape:
5974 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005975 if (end - s < digits) {
5976 /* count only hex digits */
5977 for (; s < end; ++s) {
5978 c = (unsigned char)*s;
5979 if (!Py_ISXDIGIT(c))
5980 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005981 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005982 goto error;
5983 }
5984 for (; digits--; ++s) {
5985 c = (unsigned char)*s;
5986 if (!Py_ISXDIGIT(c))
5987 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005988 chr = (chr<<4) & ~0xF;
5989 if (c >= '0' && c <= '9')
5990 chr += c - '0';
5991 else if (c >= 'a' && c <= 'f')
5992 chr += 10 + c - 'a';
5993 else
5994 chr += 10 + c - 'A';
5995 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005996 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005997 /* _decoding_error will have already written into the
5998 target buffer. */
5999 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006000 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006001 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02006002 message = "illegal Unicode character";
6003 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02006004 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02006005 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006006 break;
6007
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006009 case 'N':
6010 message = "malformed \\N character escape";
6011 if (ucnhash_CAPI == NULL) {
6012 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006013 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6014 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006015 if (ucnhash_CAPI == NULL)
6016 goto ucnhashError;
6017 }
6018 if (*s == '{') {
6019 const char *start = s+1;
6020 /* look for the closing brace */
6021 while (*s != '}' && s < end)
6022 s++;
6023 if (s > start && s < end && *s == '}') {
6024 /* found a name. look it up in the unicode database */
6025 message = "unknown Unicode character name";
6026 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02006027 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02006028 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006029 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006030 goto store;
6031 }
6032 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006033 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006034
6035 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006036 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006037 message = "\\ at end of string";
6038 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02006039 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00006040 }
6041 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006042 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02006043 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006044 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006045 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006047 continue;
6048
6049 error:
6050 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006051 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006052 errors, &errorHandler,
6053 "unicodeescape", message,
6054 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006055 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02006056 goto onError;
6057 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006059#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006060
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006061 Py_XDECREF(errorHandler);
6062 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006063 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006064
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006066 PyErr_SetString(
6067 PyExc_UnicodeError,
6068 "\\N escapes not supported (can't load unicodedata module)"
6069 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006070 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071 Py_XDECREF(errorHandler);
6072 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006073 return NULL;
6074
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006076 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006077 Py_XDECREF(errorHandler);
6078 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 return NULL;
6080}
6081
6082/* Return a Unicode-Escape string version of the Unicode object.
6083
6084 If quotes is true, the string is enclosed in u"" or u'' quotes as
6085 appropriate.
6086
6087*/
6088
Alexander Belopolsky40018472011-02-26 01:02:56 +00006089PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006090PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006092 Py_ssize_t i, len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006094 int kind;
6095 void *data;
Victor Stinner358af132015-10-12 22:36:57 +02006096 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097
Ezio Melottie7f90372012-10-05 03:33:31 +03006098 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006099 escape.
6100
Ezio Melottie7f90372012-10-05 03:33:31 +03006101 For UCS1 strings it's '\xxx', 4 bytes per source character.
6102 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6103 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006104 */
6105
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006106 if (!PyUnicode_Check(unicode)) {
6107 PyErr_BadArgument();
6108 return NULL;
6109 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006110 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006111 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006112
6113 _PyBytesWriter_Init(&writer);
6114
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006115 len = PyUnicode_GET_LENGTH(unicode);
6116 kind = PyUnicode_KIND(unicode);
6117 data = PyUnicode_DATA(unicode);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006118
Victor Stinner358af132015-10-12 22:36:57 +02006119 p = _PyBytesWriter_Alloc(&writer, len);
6120 if (p == NULL)
6121 goto error;
6122 writer.overallocate = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006124 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006125 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006126
Walter Dörwald79e913e2007-05-12 11:08:06 +00006127 /* Escape backslashes */
6128 if (ch == '\\') {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006129 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006130 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6131 if (p == NULL)
6132 goto error;
6133
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 *p++ = '\\';
6135 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006136 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006137 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006138
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006139 /* Map 21-bit characters to '\U00xxxxxx' */
6140 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006141 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006142
6143 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6144 if (p == NULL)
6145 goto error;
6146
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006147 *p++ = '\\';
6148 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006149 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6150 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6151 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6152 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6153 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6154 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6155 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6156 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006158 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006159
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006161 if (ch >= 256) {
Victor Stinner358af132015-10-12 22:36:57 +02006162 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6163 if (p == NULL)
6164 goto error;
6165
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 *p++ = '\\';
6167 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006168 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6169 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6170 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6171 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006173
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006174 /* Map special whitespace to '\t', \n', '\r' */
6175 else if (ch == '\t') {
Victor Stinner358af132015-10-12 22:36:57 +02006176 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6177 if (p == NULL)
6178 goto error;
6179
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006180 *p++ = '\\';
6181 *p++ = 't';
6182 }
6183 else if (ch == '\n') {
Victor Stinner358af132015-10-12 22:36:57 +02006184 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6185 if (p == NULL)
6186 goto error;
6187
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006188 *p++ = '\\';
6189 *p++ = 'n';
6190 }
6191 else if (ch == '\r') {
Victor Stinner358af132015-10-12 22:36:57 +02006192 p = _PyBytesWriter_Prepare(&writer, p, 2-1);
6193 if (p == NULL)
6194 goto error;
6195
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006196 *p++ = '\\';
6197 *p++ = 'r';
6198 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006199
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006200 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006201 else if (ch < ' ' || ch >= 0x7F) {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006202 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006203 p = _PyBytesWriter_Prepare(&writer, p, 4-1);
6204 if (p == NULL)
6205 goto error;
6206
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006208 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006209 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6210 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006211 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006212
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 /* Copy everything else as-is */
6214 else
6215 *p++ = (char) ch;
6216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217
Victor Stinner358af132015-10-12 22:36:57 +02006218 return _PyBytesWriter_Finish(&writer, p);
6219
6220error:
6221 _PyBytesWriter_Dealloc(&writer);
6222 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223}
6224
Alexander Belopolsky40018472011-02-26 01:02:56 +00006225PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006226PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6227 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006229 PyObject *result;
6230 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6231 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006233 result = PyUnicode_AsUnicodeEscapeString(tmp);
6234 Py_DECREF(tmp);
6235 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236}
6237
6238/* --- Raw Unicode Escape Codec ------------------------------------------- */
6239
Alexander Belopolsky40018472011-02-26 01:02:56 +00006240PyObject *
6241PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006242 Py_ssize_t size,
6243 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006245 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006246 Py_ssize_t startinpos;
6247 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006248 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 const char *end;
6250 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006251 PyObject *errorHandler = NULL;
6252 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006253
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006254 if (size == 0)
6255 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006256
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 /* Escaped strings will always be longer than the resulting
6258 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006259 length after conversion to the true value. (But decoding error
6260 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006261 _PyUnicodeWriter_Init(&writer);
6262 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006263
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 end = s + size;
6265 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 unsigned char c;
6267 Py_UCS4 x;
6268 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006269 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 /* Non-escape characters are interpreted as Unicode ordinals */
6272 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006273 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006274 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006275 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006277 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 startinpos = s-starts;
6279
6280 /* \u-escapes are only interpreted iff the number of leading
6281 backslashes if odd */
6282 bs = s;
6283 for (;s < end;) {
6284 if (*s != '\\')
6285 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006286 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006287 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006288 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 }
6290 if (((s - bs) & 1) == 0 ||
6291 s >= end ||
6292 (*s != 'u' && *s != 'U')) {
6293 continue;
6294 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006295 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006296 count = *s=='u' ? 4 : 8;
6297 s++;
6298
6299 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 for (x = 0, i = 0; i < count; ++i, ++s) {
6301 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006302 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006304 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 errors, &errorHandler,
6306 "rawunicodeescape", "truncated \\uXXXX",
6307 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006308 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 goto onError;
6310 goto nextByte;
6311 }
6312 x = (x<<4) & ~0xF;
6313 if (c >= '0' && c <= '9')
6314 x += c - '0';
6315 else if (c >= 'a' && c <= 'f')
6316 x += 10 + c - 'a';
6317 else
6318 x += 10 + c - 'A';
6319 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006320 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006321 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006322 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006323 }
6324 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006325 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006326 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006327 errors, &errorHandler,
6328 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006330 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006332 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 nextByte:
6334 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006336 Py_XDECREF(errorHandler);
6337 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006338 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006339
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006341 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342 Py_XDECREF(errorHandler);
6343 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 return NULL;
6345}
6346
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006347
Alexander Belopolsky40018472011-02-26 01:02:56 +00006348PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006349PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 char *p;
Victor Stinner358af132015-10-12 22:36:57 +02006352 Py_ssize_t pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006353 int kind;
6354 void *data;
6355 Py_ssize_t len;
Victor Stinner358af132015-10-12 22:36:57 +02006356 _PyBytesWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006358 if (!PyUnicode_Check(unicode)) {
6359 PyErr_BadArgument();
6360 return NULL;
6361 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006362 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006363 return NULL;
Victor Stinner358af132015-10-12 22:36:57 +02006364
6365 _PyBytesWriter_Init(&writer);
6366
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006367 kind = PyUnicode_KIND(unicode);
6368 data = PyUnicode_DATA(unicode);
6369 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner0e368262011-11-10 20:12:49 +01006370
Victor Stinner358af132015-10-12 22:36:57 +02006371 p = _PyBytesWriter_Alloc(&writer, len);
6372 if (p == NULL)
6373 goto error;
6374 writer.overallocate = 1;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006375
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006376 for (pos = 0; pos < len; pos++) {
6377 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 /* Map 32-bit characters to '\Uxxxxxxxx' */
6379 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006380 assert(ch <= MAX_UNICODE);
Victor Stinner358af132015-10-12 22:36:57 +02006381
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006382 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006383 p = _PyBytesWriter_Prepare(&writer, p, 10-1);
6384 if (p == NULL)
6385 goto error;
6386
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006387 *p++ = '\\';
6388 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006389 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6390 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6391 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6392 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6393 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6394 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6395 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6396 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006397 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006399 else if (ch >= 256) {
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006400 /* -1: subtract 1 preallocated byte */
Victor Stinner358af132015-10-12 22:36:57 +02006401 p = _PyBytesWriter_Prepare(&writer, p, 6-1);
6402 if (p == NULL)
6403 goto error;
6404
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 *p++ = '\\';
6406 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006407 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6408 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6409 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6410 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 /* Copy everything else as-is */
6413 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 *p++ = (char) ch;
6415 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006416
Victor Stinner358af132015-10-12 22:36:57 +02006417 return _PyBytesWriter_Finish(&writer, p);
6418
6419error:
6420 _PyBytesWriter_Dealloc(&writer);
6421 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422}
6423
Alexander Belopolsky40018472011-02-26 01:02:56 +00006424PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006425PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6426 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006428 PyObject *result;
6429 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6430 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006431 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006432 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6433 Py_DECREF(tmp);
6434 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435}
6436
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006437/* --- Unicode Internal Codec ------------------------------------------- */
6438
Alexander Belopolsky40018472011-02-26 01:02:56 +00006439PyObject *
6440_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006441 Py_ssize_t size,
6442 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006443{
6444 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006445 Py_ssize_t startinpos;
6446 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006447 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006448 const char *end;
6449 const char *reason;
6450 PyObject *errorHandler = NULL;
6451 PyObject *exc = NULL;
6452
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006453 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006454 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006455 1))
6456 return NULL;
6457
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006458 if (size == 0)
6459 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006460
Victor Stinner8f674cc2013-04-17 23:02:17 +02006461 _PyUnicodeWriter_Init(&writer);
6462 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6463 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006465 }
6466 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006467
Victor Stinner8f674cc2013-04-17 23:02:17 +02006468 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006469 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006470 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006471 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006472 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006473 endinpos = end-starts;
6474 reason = "truncated input";
6475 goto error;
6476 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006477 /* We copy the raw representation one byte at a time because the
6478 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006479 ((char *) &uch)[0] = s[0];
6480 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006481#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006482 ((char *) &uch)[2] = s[2];
6483 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006484#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006485 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006486#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006487 /* We have to sanity check the raw data, otherwise doom looms for
6488 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006489 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006490 endinpos = s - starts + Py_UNICODE_SIZE;
6491 reason = "illegal code point (> 0x10FFFF)";
6492 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006493 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006494#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006495 s += Py_UNICODE_SIZE;
6496#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006497 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006498 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006499 Py_UNICODE uch2;
6500 ((char *) &uch2)[0] = s[0];
6501 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006502 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006503 {
Victor Stinner551ac952011-11-29 22:58:13 +01006504 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006505 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006506 }
6507 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006508#endif
6509
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006510 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006511 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006512 continue;
6513
6514 error:
6515 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006516 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006517 errors, &errorHandler,
6518 "unicode_internal", reason,
6519 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006520 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006521 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006522 }
6523
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006524 Py_XDECREF(errorHandler);
6525 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006526 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006527
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006529 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006530 Py_XDECREF(errorHandler);
6531 Py_XDECREF(exc);
6532 return NULL;
6533}
6534
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535/* --- Latin-1 Codec ------------------------------------------------------ */
6536
Alexander Belopolsky40018472011-02-26 01:02:56 +00006537PyObject *
6538PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006539 Py_ssize_t size,
6540 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006543 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544}
6545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006546/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006547static void
6548make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006549 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006550 PyObject *unicode,
6551 Py_ssize_t startpos, Py_ssize_t endpos,
6552 const char *reason)
6553{
6554 if (*exceptionObject == NULL) {
6555 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006556 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006557 encoding, unicode, startpos, endpos, reason);
6558 }
6559 else {
6560 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6561 goto onError;
6562 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6563 goto onError;
6564 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6565 goto onError;
6566 return;
6567 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006568 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006569 }
6570}
6571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006572/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006573static void
6574raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006575 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006576 PyObject *unicode,
6577 Py_ssize_t startpos, Py_ssize_t endpos,
6578 const char *reason)
6579{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006580 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006581 encoding, unicode, startpos, endpos, reason);
6582 if (*exceptionObject != NULL)
6583 PyCodec_StrictErrors(*exceptionObject);
6584}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006585
6586/* error handling callback helper:
6587 build arguments, call the callback and check the arguments,
6588 put the result into newpos and return the replacement string, which
6589 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006590static PyObject *
6591unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006592 PyObject **errorHandler,
6593 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006594 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006595 Py_ssize_t startpos, Py_ssize_t endpos,
6596 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006597{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006598 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006599 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006600 PyObject *restuple;
6601 PyObject *resunicode;
6602
6603 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006605 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006607 }
6608
Benjamin Petersonbac79492012-01-14 13:34:47 -05006609 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006610 return NULL;
6611 len = PyUnicode_GET_LENGTH(unicode);
6612
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006613 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006614 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006615 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006617
6618 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006623 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 Py_DECREF(restuple);
6625 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006626 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006627 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 &resunicode, newpos)) {
6629 Py_DECREF(restuple);
6630 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006631 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006632 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6633 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6634 Py_DECREF(restuple);
6635 return NULL;
6636 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006637 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006638 *newpos = len + *newpos;
6639 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006640 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 Py_DECREF(restuple);
6642 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006643 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 Py_INCREF(resunicode);
6645 Py_DECREF(restuple);
6646 return resunicode;
6647}
6648
Alexander Belopolsky40018472011-02-26 01:02:56 +00006649static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006650unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006651 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006652 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006654 /* input state */
6655 Py_ssize_t pos=0, size;
6656 int kind;
6657 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006658 /* pointer into the output */
6659 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006660 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6661 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006662 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006663 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006664 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006665 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006666 /* output object */
6667 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668
Benjamin Petersonbac79492012-01-14 13:34:47 -05006669 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006670 return NULL;
6671 size = PyUnicode_GET_LENGTH(unicode);
6672 kind = PyUnicode_KIND(unicode);
6673 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674 /* allocate enough for a simple encoding without
6675 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006676 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006677 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006678
6679 _PyBytesWriter_Init(&writer);
6680 str = _PyBytesWriter_Alloc(&writer, size);
6681 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006682 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006683
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006684 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006685 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686
Benjamin Peterson29060642009-01-31 22:14:21 +00006687 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006688 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006690 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006691 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006692 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006694 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006695 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006696 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006697 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006699
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006700 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006702
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006703 /* Only overallocate the buffer if it's not the last write */
6704 writer.overallocate = (collend < size);
6705
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006707 if (error_handler == _Py_ERROR_UNKNOWN)
6708 error_handler = get_error_handler(errors);
6709
6710 switch (error_handler) {
6711 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006712 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006714
6715 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006716 memset(str, '?', collend - collstart);
6717 str += (collend - collstart);
Victor Stinner0030cd52015-09-24 14:45:00 +02006718 /* fall through ignore error handler */
Victor Stinner50149202015-09-22 00:26:54 +02006719 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 break;
Victor Stinner50149202015-09-22 00:26:54 +02006722
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006723 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006724 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006725 writer.min_size -= (collend - collstart);
6726 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006727 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006728 if (str == NULL)
6729 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006730 pos = collend;
6731 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006732
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006733 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006734 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006735 writer.min_size -= (collend - collstart);
6736 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006737 unicode, collstart, collend);
6738 if (str == NULL)
6739 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006740 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 break;
Victor Stinner50149202015-09-22 00:26:54 +02006742
Victor Stinnerc3713e92015-09-29 12:32:13 +02006743 case _Py_ERROR_SURROGATEESCAPE:
6744 for (i = collstart; i < collend; ++i) {
6745 ch = PyUnicode_READ(kind, data, i);
6746 if (ch < 0xdc80 || 0xdcff < ch) {
6747 /* Not a UTF-8b surrogate */
6748 break;
6749 }
6750 *str++ = (char)(ch - 0xdc00);
6751 ++pos;
6752 }
6753 if (i >= collend)
6754 break;
6755 collstart = pos;
6756 assert(collstart != collend);
6757 /* fallback to general error handling */
6758
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006760 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6761 encoding, reason, unicode, &exc,
6762 collstart, collend, &newpos);
6763 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006765
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006766 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006767 writer.min_size -= 1;
6768
Victor Stinner6bd525b2015-10-09 13:10:05 +02006769 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006770 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006771 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006772 PyBytes_AS_STRING(rep),
6773 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006774 if (str == NULL)
6775 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006776 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006777 else {
6778 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006779
Victor Stinner6bd525b2015-10-09 13:10:05 +02006780 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006782
6783 if (PyUnicode_IS_ASCII(rep)) {
6784 /* Fast path: all characters are smaller than limit */
6785 assert(limit >= 128);
6786 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6787 str = _PyBytesWriter_WriteBytes(&writer, str,
6788 PyUnicode_DATA(rep),
6789 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006791 else {
6792 Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
6793
6794 str = _PyBytesWriter_Prepare(&writer, str, repsize);
6795 if (str == NULL)
6796 goto onError;
6797
6798 /* check if there is anything unencodable in the
6799 replacement and copy it to the output */
6800 for (i = 0; repsize-->0; ++i, ++str) {
6801 ch = PyUnicode_READ_CHAR(rep, i);
6802 if (ch >= limit) {
6803 raise_encode_exception(&exc, encoding, unicode,
6804 pos, pos+1, reason);
6805 goto onError;
6806 }
6807 *str = (char)ch;
6808 }
6809 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006811 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006812 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006813 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006814
6815 /* If overallocation was disabled, ensure that it was the last
6816 write. Otherwise, we missed an optimization */
6817 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006818 }
6819 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006820
Victor Stinner50149202015-09-22 00:26:54 +02006821 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006823 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006824
6825 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006826 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006827 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006828 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006829 Py_XDECREF(exc);
6830 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006831}
6832
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006833/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006834PyObject *
6835PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006836 Py_ssize_t size,
6837 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006839 PyObject *result;
6840 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6841 if (unicode == NULL)
6842 return NULL;
6843 result = unicode_encode_ucs1(unicode, errors, 256);
6844 Py_DECREF(unicode);
6845 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846}
6847
Alexander Belopolsky40018472011-02-26 01:02:56 +00006848PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006849_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850{
6851 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006852 PyErr_BadArgument();
6853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006855 if (PyUnicode_READY(unicode) == -1)
6856 return NULL;
6857 /* Fast path: if it is a one-byte string, construct
6858 bytes object directly. */
6859 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6860 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6861 PyUnicode_GET_LENGTH(unicode));
6862 /* Non-Latin-1 characters present. Defer to above function to
6863 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006864 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006865}
6866
6867PyObject*
6868PyUnicode_AsLatin1String(PyObject *unicode)
6869{
6870 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871}
6872
6873/* --- 7-bit ASCII Codec -------------------------------------------------- */
6874
Alexander Belopolsky40018472011-02-26 01:02:56 +00006875PyObject *
6876PyUnicode_DecodeASCII(const char *s,
6877 Py_ssize_t size,
6878 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006880 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006881 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006882 int kind;
6883 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006884 Py_ssize_t startinpos;
6885 Py_ssize_t endinpos;
6886 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006887 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006888 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006889 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006890 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006891
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006893 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006894
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006896 if (size == 1 && (unsigned char)s[0] < 128)
6897 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006898
Victor Stinner8f674cc2013-04-17 23:02:17 +02006899 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006900 writer.min_length = size;
6901 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006902 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006903
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006904 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006905 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006906 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006907 writer.pos = outpos;
6908 if (writer.pos == size)
6909 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006910
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006911 s += writer.pos;
6912 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006914 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006916 PyUnicode_WRITE(kind, data, writer.pos, c);
6917 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006919 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006921
6922 /* byte outsize range 0x00..0x7f: call the error handler */
6923
6924 if (error_handler == _Py_ERROR_UNKNOWN)
6925 error_handler = get_error_handler(errors);
6926
6927 switch (error_handler)
6928 {
6929 case _Py_ERROR_REPLACE:
6930 case _Py_ERROR_SURROGATEESCAPE:
6931 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006932 but we may switch to UCS2 at the first write */
6933 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6934 goto onError;
6935 kind = writer.kind;
6936 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006937
6938 if (error_handler == _Py_ERROR_REPLACE)
6939 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6940 else
6941 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6942 writer.pos++;
6943 ++s;
6944 break;
6945
6946 case _Py_ERROR_IGNORE:
6947 ++s;
6948 break;
6949
6950 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 startinpos = s-starts;
6952 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006953 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02006954 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00006955 "ascii", "ordinal not in range(128)",
6956 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006957 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006958 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006959 kind = writer.kind;
6960 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006963 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006964 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006965 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006966
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006968 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02006969 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006970 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971 return NULL;
6972}
6973
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006974/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006975PyObject *
6976PyUnicode_EncodeASCII(const Py_UNICODE *p,
6977 Py_ssize_t size,
6978 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006980 PyObject *result;
6981 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6982 if (unicode == NULL)
6983 return NULL;
6984 result = unicode_encode_ucs1(unicode, errors, 128);
6985 Py_DECREF(unicode);
6986 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987}
6988
Alexander Belopolsky40018472011-02-26 01:02:56 +00006989PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006990_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991{
6992 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 PyErr_BadArgument();
6994 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006996 if (PyUnicode_READY(unicode) == -1)
6997 return NULL;
6998 /* Fast path: if it is an ASCII-only string, construct bytes object
6999 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007000 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007001 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7002 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007003 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007004}
7005
7006PyObject *
7007PyUnicode_AsASCIIString(PyObject *unicode)
7008{
7009 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010}
7011
Victor Stinner99b95382011-07-04 14:23:54 +02007012#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007013
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007014/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007015
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007016#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007017#define NEED_RETRY
7018#endif
7019
Victor Stinner3a50e702011-10-18 21:21:00 +02007020#ifndef WC_ERR_INVALID_CHARS
7021# define WC_ERR_INVALID_CHARS 0x0080
7022#endif
7023
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007024static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007025code_page_name(UINT code_page, PyObject **obj)
7026{
7027 *obj = NULL;
7028 if (code_page == CP_ACP)
7029 return "mbcs";
7030 if (code_page == CP_UTF7)
7031 return "CP_UTF7";
7032 if (code_page == CP_UTF8)
7033 return "CP_UTF8";
7034
7035 *obj = PyBytes_FromFormat("cp%u", code_page);
7036 if (*obj == NULL)
7037 return NULL;
7038 return PyBytes_AS_STRING(*obj);
7039}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007040
Victor Stinner3a50e702011-10-18 21:21:00 +02007041static DWORD
7042decode_code_page_flags(UINT code_page)
7043{
7044 if (code_page == CP_UTF7) {
7045 /* The CP_UTF7 decoder only supports flags=0 */
7046 return 0;
7047 }
7048 else
7049 return MB_ERR_INVALID_CHARS;
7050}
7051
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007052/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007053 * Decode a byte string from a Windows code page into unicode object in strict
7054 * mode.
7055 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007056 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7057 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007059static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007060decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007061 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007062 const char *in,
7063 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007064{
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007066 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007067 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007068
7069 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007070 assert(insize > 0);
7071 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7072 if (outsize <= 0)
7073 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007074
7075 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007076 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007077 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007078 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007079 if (*v == NULL)
7080 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007081 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007082 }
7083 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007085 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007086 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007087 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007088 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089 }
7090
7091 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7093 if (outsize <= 0)
7094 goto error;
7095 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007096
Victor Stinner3a50e702011-10-18 21:21:00 +02007097error:
7098 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7099 return -2;
7100 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007101 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007102}
7103
Victor Stinner3a50e702011-10-18 21:21:00 +02007104/*
7105 * Decode a byte string from a code page into unicode object with an error
7106 * handler.
7107 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007108 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007109 * UnicodeDecodeError exception and returns -1 on error.
7110 */
7111static int
7112decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007113 PyObject **v,
7114 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007115 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007116{
7117 const char *startin = in;
7118 const char *endin = in + size;
7119 const DWORD flags = decode_code_page_flags(code_page);
7120 /* Ideally, we should get reason from FormatMessage. This is the Windows
7121 2000 English version of the message. */
7122 const char *reason = "No mapping for the Unicode character exists "
7123 "in the target code page.";
7124 /* each step cannot decode more than 1 character, but a character can be
7125 represented as a surrogate pair */
7126 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007127 int insize;
7128 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 PyObject *errorHandler = NULL;
7130 PyObject *exc = NULL;
7131 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007132 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007133 DWORD err;
7134 int ret = -1;
7135
7136 assert(size > 0);
7137
7138 encoding = code_page_name(code_page, &encoding_obj);
7139 if (encoding == NULL)
7140 return -1;
7141
Victor Stinner7d00cc12014-03-17 23:08:06 +01007142 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7144 UnicodeDecodeError. */
7145 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7146 if (exc != NULL) {
7147 PyCodec_StrictErrors(exc);
7148 Py_CLEAR(exc);
7149 }
7150 goto error;
7151 }
7152
7153 if (*v == NULL) {
7154 /* Create unicode object */
7155 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7156 PyErr_NoMemory();
7157 goto error;
7158 }
Victor Stinnerab595942011-12-17 04:59:06 +01007159 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007160 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 if (*v == NULL)
7162 goto error;
7163 startout = PyUnicode_AS_UNICODE(*v);
7164 }
7165 else {
7166 /* Extend unicode object */
7167 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7168 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7169 PyErr_NoMemory();
7170 goto error;
7171 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007172 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007173 goto error;
7174 startout = PyUnicode_AS_UNICODE(*v) + n;
7175 }
7176
7177 /* Decode the byte string character per character */
7178 out = startout;
7179 while (in < endin)
7180 {
7181 /* Decode a character */
7182 insize = 1;
7183 do
7184 {
7185 outsize = MultiByteToWideChar(code_page, flags,
7186 in, insize,
7187 buffer, Py_ARRAY_LENGTH(buffer));
7188 if (outsize > 0)
7189 break;
7190 err = GetLastError();
7191 if (err != ERROR_NO_UNICODE_TRANSLATION
7192 && err != ERROR_INSUFFICIENT_BUFFER)
7193 {
7194 PyErr_SetFromWindowsErr(0);
7195 goto error;
7196 }
7197 insize++;
7198 }
7199 /* 4=maximum length of a UTF-8 sequence */
7200 while (insize <= 4 && (in + insize) <= endin);
7201
7202 if (outsize <= 0) {
7203 Py_ssize_t startinpos, endinpos, outpos;
7204
Victor Stinner7d00cc12014-03-17 23:08:06 +01007205 /* last character in partial decode? */
7206 if (in + insize >= endin && !final)
7207 break;
7208
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 startinpos = in - startin;
7210 endinpos = startinpos + 1;
7211 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007212 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 errors, &errorHandler,
7214 encoding, reason,
7215 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007216 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007217 {
7218 goto error;
7219 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007220 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 }
7222 else {
7223 in += insize;
7224 memcpy(out, buffer, outsize * sizeof(wchar_t));
7225 out += outsize;
7226 }
7227 }
7228
7229 /* write a NUL character at the end */
7230 *out = 0;
7231
7232 /* Extend unicode object */
7233 outsize = out - startout;
7234 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007235 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007236 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007237 /* (in - startin) <= size and size is an int */
7238 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007239
7240error:
7241 Py_XDECREF(encoding_obj);
7242 Py_XDECREF(errorHandler);
7243 Py_XDECREF(exc);
7244 return ret;
7245}
7246
Victor Stinner3a50e702011-10-18 21:21:00 +02007247static PyObject *
7248decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007249 const char *s, Py_ssize_t size,
7250 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007251{
Victor Stinner76a31a62011-11-04 00:05:13 +01007252 PyObject *v = NULL;
7253 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007254
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 if (code_page < 0) {
7256 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7257 return NULL;
7258 }
7259
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007260 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007262
Victor Stinner76a31a62011-11-04 00:05:13 +01007263 do
7264 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007265#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007266 if (size > INT_MAX) {
7267 chunk_size = INT_MAX;
7268 final = 0;
7269 done = 0;
7270 }
7271 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007272#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007273 {
7274 chunk_size = (int)size;
7275 final = (consumed == NULL);
7276 done = 1;
7277 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007278
Victor Stinner76a31a62011-11-04 00:05:13 +01007279 if (chunk_size == 0 && done) {
7280 if (v != NULL)
7281 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007282 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007283 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007284
Victor Stinner76a31a62011-11-04 00:05:13 +01007285 converted = decode_code_page_strict(code_page, &v,
7286 s, chunk_size);
7287 if (converted == -2)
7288 converted = decode_code_page_errors(code_page, &v,
7289 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007290 errors, final);
7291 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007292
7293 if (converted < 0) {
7294 Py_XDECREF(v);
7295 return NULL;
7296 }
7297
7298 if (consumed)
7299 *consumed += converted;
7300
7301 s += converted;
7302 size -= converted;
7303 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007304
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007305 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007306}
7307
Alexander Belopolsky40018472011-02-26 01:02:56 +00007308PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007309PyUnicode_DecodeCodePageStateful(int code_page,
7310 const char *s,
7311 Py_ssize_t size,
7312 const char *errors,
7313 Py_ssize_t *consumed)
7314{
7315 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7316}
7317
7318PyObject *
7319PyUnicode_DecodeMBCSStateful(const char *s,
7320 Py_ssize_t size,
7321 const char *errors,
7322 Py_ssize_t *consumed)
7323{
7324 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7325}
7326
7327PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007328PyUnicode_DecodeMBCS(const char *s,
7329 Py_ssize_t size,
7330 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007331{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7333}
7334
Victor Stinner3a50e702011-10-18 21:21:00 +02007335static DWORD
7336encode_code_page_flags(UINT code_page, const char *errors)
7337{
7338 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007339 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007340 }
7341 else if (code_page == CP_UTF7) {
7342 /* CP_UTF7 only supports flags=0 */
7343 return 0;
7344 }
7345 else {
7346 if (errors != NULL && strcmp(errors, "replace") == 0)
7347 return 0;
7348 else
7349 return WC_NO_BEST_FIT_CHARS;
7350 }
7351}
7352
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007353/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007354 * Encode a Unicode string to a Windows code page into a byte string in strict
7355 * mode.
7356 *
7357 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007358 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007360static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007361encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007362 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007363 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007364{
Victor Stinner554f3f02010-06-16 23:33:54 +00007365 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007366 BOOL *pusedDefaultChar = &usedDefaultChar;
7367 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007368 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007369 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007370 const DWORD flags = encode_code_page_flags(code_page, NULL);
7371 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007372 /* Create a substring so that we can get the UTF-16 representation
7373 of just the slice under consideration. */
7374 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007375
Martin v. Löwis3d325192011-11-04 18:23:06 +01007376 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007377
Victor Stinner3a50e702011-10-18 21:21:00 +02007378 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007379 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007381 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007382
Victor Stinner2fc507f2011-11-04 20:06:39 +01007383 substring = PyUnicode_Substring(unicode, offset, offset+len);
7384 if (substring == NULL)
7385 return -1;
7386 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7387 if (p == NULL) {
7388 Py_DECREF(substring);
7389 return -1;
7390 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007391 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007392
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007393 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007395 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 NULL, 0,
7397 NULL, pusedDefaultChar);
7398 if (outsize <= 0)
7399 goto error;
7400 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007401 if (pusedDefaultChar && *pusedDefaultChar) {
7402 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007404 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007405
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007408 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007409 if (*outbytes == NULL) {
7410 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007412 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007414 }
7415 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 const Py_ssize_t n = PyBytes_Size(*outbytes);
7418 if (outsize > PY_SSIZE_T_MAX - n) {
7419 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007420 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007423 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7424 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007426 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007428 }
7429
7430 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007432 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 out, outsize,
7434 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007435 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 if (outsize <= 0)
7437 goto error;
7438 if (pusedDefaultChar && *pusedDefaultChar)
7439 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007440 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007441
Victor Stinner3a50e702011-10-18 21:21:00 +02007442error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007443 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007444 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7445 return -2;
7446 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007447 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007448}
7449
Victor Stinner3a50e702011-10-18 21:21:00 +02007450/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007451 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 * error handler.
7453 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007454 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 * -1 on other error.
7456 */
7457static int
7458encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007459 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007460 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007461{
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007463 Py_ssize_t pos = unicode_offset;
7464 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 /* Ideally, we should get reason from FormatMessage. This is the Windows
7466 2000 English version of the message. */
7467 const char *reason = "invalid character";
7468 /* 4=maximum length of a UTF-8 sequence */
7469 char buffer[4];
7470 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7471 Py_ssize_t outsize;
7472 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 PyObject *errorHandler = NULL;
7474 PyObject *exc = NULL;
7475 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007476 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007477 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 PyObject *rep;
7479 int ret = -1;
7480
7481 assert(insize > 0);
7482
7483 encoding = code_page_name(code_page, &encoding_obj);
7484 if (encoding == NULL)
7485 return -1;
7486
7487 if (errors == NULL || strcmp(errors, "strict") == 0) {
7488 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7489 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007490 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 if (exc != NULL) {
7492 PyCodec_StrictErrors(exc);
7493 Py_DECREF(exc);
7494 }
7495 Py_XDECREF(encoding_obj);
7496 return -1;
7497 }
7498
7499 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7500 pusedDefaultChar = &usedDefaultChar;
7501 else
7502 pusedDefaultChar = NULL;
7503
7504 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7505 PyErr_NoMemory();
7506 goto error;
7507 }
7508 outsize = insize * Py_ARRAY_LENGTH(buffer);
7509
7510 if (*outbytes == NULL) {
7511 /* Create string object */
7512 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7513 if (*outbytes == NULL)
7514 goto error;
7515 out = PyBytes_AS_STRING(*outbytes);
7516 }
7517 else {
7518 /* Extend string object */
7519 Py_ssize_t n = PyBytes_Size(*outbytes);
7520 if (n > PY_SSIZE_T_MAX - outsize) {
7521 PyErr_NoMemory();
7522 goto error;
7523 }
7524 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7525 goto error;
7526 out = PyBytes_AS_STRING(*outbytes) + n;
7527 }
7528
7529 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007530 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007531 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007532 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7533 wchar_t chars[2];
7534 int charsize;
7535 if (ch < 0x10000) {
7536 chars[0] = (wchar_t)ch;
7537 charsize = 1;
7538 }
7539 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007540 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7541 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007542 charsize = 2;
7543 }
7544
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007546 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 buffer, Py_ARRAY_LENGTH(buffer),
7548 NULL, pusedDefaultChar);
7549 if (outsize > 0) {
7550 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7551 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007552 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007553 memcpy(out, buffer, outsize);
7554 out += outsize;
7555 continue;
7556 }
7557 }
7558 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7559 PyErr_SetFromWindowsErr(0);
7560 goto error;
7561 }
7562
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 rep = unicode_encode_call_errorhandler(
7564 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007565 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007566 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007567 if (rep == NULL)
7568 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007569 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007570
7571 if (PyBytes_Check(rep)) {
7572 outsize = PyBytes_GET_SIZE(rep);
7573 if (outsize != 1) {
7574 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7575 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7576 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7577 Py_DECREF(rep);
7578 goto error;
7579 }
7580 out = PyBytes_AS_STRING(*outbytes) + offset;
7581 }
7582 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7583 out += outsize;
7584 }
7585 else {
7586 Py_ssize_t i;
7587 enum PyUnicode_Kind kind;
7588 void *data;
7589
Benjamin Petersonbac79492012-01-14 13:34:47 -05007590 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 Py_DECREF(rep);
7592 goto error;
7593 }
7594
7595 outsize = PyUnicode_GET_LENGTH(rep);
7596 if (outsize != 1) {
7597 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7598 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7599 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7600 Py_DECREF(rep);
7601 goto error;
7602 }
7603 out = PyBytes_AS_STRING(*outbytes) + offset;
7604 }
7605 kind = PyUnicode_KIND(rep);
7606 data = PyUnicode_DATA(rep);
7607 for (i=0; i < outsize; i++) {
7608 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7609 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007610 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007611 encoding, unicode,
7612 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007613 "unable to encode error handler result to ASCII");
7614 Py_DECREF(rep);
7615 goto error;
7616 }
7617 *out = (unsigned char)ch;
7618 out++;
7619 }
7620 }
7621 Py_DECREF(rep);
7622 }
7623 /* write a NUL byte */
7624 *out = 0;
7625 outsize = out - PyBytes_AS_STRING(*outbytes);
7626 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7627 if (_PyBytes_Resize(outbytes, outsize) < 0)
7628 goto error;
7629 ret = 0;
7630
7631error:
7632 Py_XDECREF(encoding_obj);
7633 Py_XDECREF(errorHandler);
7634 Py_XDECREF(exc);
7635 return ret;
7636}
7637
Victor Stinner3a50e702011-10-18 21:21:00 +02007638static PyObject *
7639encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007640 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007641 const char *errors)
7642{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007643 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007644 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007645 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007646 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007647
Victor Stinner29dacf22015-01-26 16:41:32 +01007648 if (!PyUnicode_Check(unicode)) {
7649 PyErr_BadArgument();
7650 return NULL;
7651 }
7652
Benjamin Petersonbac79492012-01-14 13:34:47 -05007653 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007654 return NULL;
7655 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007656
Victor Stinner3a50e702011-10-18 21:21:00 +02007657 if (code_page < 0) {
7658 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7659 return NULL;
7660 }
7661
Martin v. Löwis3d325192011-11-04 18:23:06 +01007662 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007663 return PyBytes_FromStringAndSize(NULL, 0);
7664
Victor Stinner7581cef2011-11-03 22:32:33 +01007665 offset = 0;
7666 do
7667 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007668#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007669 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007670 chunks. */
7671 if (len > INT_MAX/2) {
7672 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007673 done = 0;
7674 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007675 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007676#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007677 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007678 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007679 done = 1;
7680 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007681
Victor Stinner76a31a62011-11-04 00:05:13 +01007682 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007683 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007684 errors);
7685 if (ret == -2)
7686 ret = encode_code_page_errors(code_page, &outbytes,
7687 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007688 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007689 if (ret < 0) {
7690 Py_XDECREF(outbytes);
7691 return NULL;
7692 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007693
Victor Stinner7581cef2011-11-03 22:32:33 +01007694 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007695 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007696 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007697
Victor Stinner3a50e702011-10-18 21:21:00 +02007698 return outbytes;
7699}
7700
7701PyObject *
7702PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7703 Py_ssize_t size,
7704 const char *errors)
7705{
Victor Stinner7581cef2011-11-03 22:32:33 +01007706 PyObject *unicode, *res;
7707 unicode = PyUnicode_FromUnicode(p, size);
7708 if (unicode == NULL)
7709 return NULL;
7710 res = encode_code_page(CP_ACP, unicode, errors);
7711 Py_DECREF(unicode);
7712 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007713}
7714
7715PyObject *
7716PyUnicode_EncodeCodePage(int code_page,
7717 PyObject *unicode,
7718 const char *errors)
7719{
Victor Stinner7581cef2011-11-03 22:32:33 +01007720 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007721}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007722
Alexander Belopolsky40018472011-02-26 01:02:56 +00007723PyObject *
7724PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007725{
Victor Stinner7581cef2011-11-03 22:32:33 +01007726 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007727}
7728
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007729#undef NEED_RETRY
7730
Victor Stinner99b95382011-07-04 14:23:54 +02007731#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007732
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733/* --- Character Mapping Codec -------------------------------------------- */
7734
Victor Stinnerfb161b12013-04-18 01:44:27 +02007735static int
7736charmap_decode_string(const char *s,
7737 Py_ssize_t size,
7738 PyObject *mapping,
7739 const char *errors,
7740 _PyUnicodeWriter *writer)
7741{
7742 const char *starts = s;
7743 const char *e;
7744 Py_ssize_t startinpos, endinpos;
7745 PyObject *errorHandler = NULL, *exc = NULL;
7746 Py_ssize_t maplen;
7747 enum PyUnicode_Kind mapkind;
7748 void *mapdata;
7749 Py_UCS4 x;
7750 unsigned char ch;
7751
7752 if (PyUnicode_READY(mapping) == -1)
7753 return -1;
7754
7755 maplen = PyUnicode_GET_LENGTH(mapping);
7756 mapdata = PyUnicode_DATA(mapping);
7757 mapkind = PyUnicode_KIND(mapping);
7758
7759 e = s + size;
7760
7761 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7762 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7763 * is disabled in encoding aliases, latin1 is preferred because
7764 * its implementation is faster. */
7765 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7766 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7767 Py_UCS4 maxchar = writer->maxchar;
7768
7769 assert (writer->kind == PyUnicode_1BYTE_KIND);
7770 while (s < e) {
7771 ch = *s;
7772 x = mapdata_ucs1[ch];
7773 if (x > maxchar) {
7774 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7775 goto onError;
7776 maxchar = writer->maxchar;
7777 outdata = (Py_UCS1 *)writer->data;
7778 }
7779 outdata[writer->pos] = x;
7780 writer->pos++;
7781 ++s;
7782 }
7783 return 0;
7784 }
7785
7786 while (s < e) {
7787 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7788 enum PyUnicode_Kind outkind = writer->kind;
7789 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7790 if (outkind == PyUnicode_1BYTE_KIND) {
7791 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7792 Py_UCS4 maxchar = writer->maxchar;
7793 while (s < e) {
7794 ch = *s;
7795 x = mapdata_ucs2[ch];
7796 if (x > maxchar)
7797 goto Error;
7798 outdata[writer->pos] = x;
7799 writer->pos++;
7800 ++s;
7801 }
7802 break;
7803 }
7804 else if (outkind == PyUnicode_2BYTE_KIND) {
7805 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7806 while (s < e) {
7807 ch = *s;
7808 x = mapdata_ucs2[ch];
7809 if (x == 0xFFFE)
7810 goto Error;
7811 outdata[writer->pos] = x;
7812 writer->pos++;
7813 ++s;
7814 }
7815 break;
7816 }
7817 }
7818 ch = *s;
7819
7820 if (ch < maplen)
7821 x = PyUnicode_READ(mapkind, mapdata, ch);
7822 else
7823 x = 0xfffe; /* invalid value */
7824Error:
7825 if (x == 0xfffe)
7826 {
7827 /* undefined mapping */
7828 startinpos = s-starts;
7829 endinpos = startinpos+1;
7830 if (unicode_decode_call_errorhandler_writer(
7831 errors, &errorHandler,
7832 "charmap", "character maps to <undefined>",
7833 &starts, &e, &startinpos, &endinpos, &exc, &s,
7834 writer)) {
7835 goto onError;
7836 }
7837 continue;
7838 }
7839
7840 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7841 goto onError;
7842 ++s;
7843 }
7844 Py_XDECREF(errorHandler);
7845 Py_XDECREF(exc);
7846 return 0;
7847
7848onError:
7849 Py_XDECREF(errorHandler);
7850 Py_XDECREF(exc);
7851 return -1;
7852}
7853
7854static int
7855charmap_decode_mapping(const char *s,
7856 Py_ssize_t size,
7857 PyObject *mapping,
7858 const char *errors,
7859 _PyUnicodeWriter *writer)
7860{
7861 const char *starts = s;
7862 const char *e;
7863 Py_ssize_t startinpos, endinpos;
7864 PyObject *errorHandler = NULL, *exc = NULL;
7865 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007866 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007867
7868 e = s + size;
7869
7870 while (s < e) {
7871 ch = *s;
7872
7873 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7874 key = PyLong_FromLong((long)ch);
7875 if (key == NULL)
7876 goto onError;
7877
7878 item = PyObject_GetItem(mapping, key);
7879 Py_DECREF(key);
7880 if (item == NULL) {
7881 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7882 /* No mapping found means: mapping is undefined. */
7883 PyErr_Clear();
7884 goto Undefined;
7885 } else
7886 goto onError;
7887 }
7888
7889 /* Apply mapping */
7890 if (item == Py_None)
7891 goto Undefined;
7892 if (PyLong_Check(item)) {
7893 long value = PyLong_AS_LONG(item);
7894 if (value == 0xFFFE)
7895 goto Undefined;
7896 if (value < 0 || value > MAX_UNICODE) {
7897 PyErr_Format(PyExc_TypeError,
7898 "character mapping must be in range(0x%lx)",
7899 (unsigned long)MAX_UNICODE + 1);
7900 goto onError;
7901 }
7902
7903 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7904 goto onError;
7905 }
7906 else if (PyUnicode_Check(item)) {
7907 if (PyUnicode_READY(item) == -1)
7908 goto onError;
7909 if (PyUnicode_GET_LENGTH(item) == 1) {
7910 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7911 if (value == 0xFFFE)
7912 goto Undefined;
7913 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7914 goto onError;
7915 }
7916 else {
7917 writer->overallocate = 1;
7918 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7919 goto onError;
7920 }
7921 }
7922 else {
7923 /* wrong return value */
7924 PyErr_SetString(PyExc_TypeError,
7925 "character mapping must return integer, None or str");
7926 goto onError;
7927 }
7928 Py_CLEAR(item);
7929 ++s;
7930 continue;
7931
7932Undefined:
7933 /* undefined mapping */
7934 Py_CLEAR(item);
7935 startinpos = s-starts;
7936 endinpos = startinpos+1;
7937 if (unicode_decode_call_errorhandler_writer(
7938 errors, &errorHandler,
7939 "charmap", "character maps to <undefined>",
7940 &starts, &e, &startinpos, &endinpos, &exc, &s,
7941 writer)) {
7942 goto onError;
7943 }
7944 }
7945 Py_XDECREF(errorHandler);
7946 Py_XDECREF(exc);
7947 return 0;
7948
7949onError:
7950 Py_XDECREF(item);
7951 Py_XDECREF(errorHandler);
7952 Py_XDECREF(exc);
7953 return -1;
7954}
7955
Alexander Belopolsky40018472011-02-26 01:02:56 +00007956PyObject *
7957PyUnicode_DecodeCharmap(const char *s,
7958 Py_ssize_t size,
7959 PyObject *mapping,
7960 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007962 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007963
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 /* Default to Latin-1 */
7965 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007969 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007970 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007971 writer.min_length = size;
7972 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007974
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007975 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007976 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7977 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007978 }
7979 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007980 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7981 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007983 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007984
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007986 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 return NULL;
7988}
7989
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007990/* Charmap encoding: the lookup table */
7991
Alexander Belopolsky40018472011-02-26 01:02:56 +00007992struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 PyObject_HEAD
7994 unsigned char level1[32];
7995 int count2, count3;
7996 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007997};
7998
7999static PyObject*
8000encoding_map_size(PyObject *obj, PyObject* args)
8001{
8002 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008003 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008005}
8006
8007static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008008 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 PyDoc_STR("Return the size (in bytes) of this object") },
8010 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008011};
8012
8013static void
8014encoding_map_dealloc(PyObject* o)
8015{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008016 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008017}
8018
8019static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008020 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 "EncodingMap", /*tp_name*/
8022 sizeof(struct encoding_map), /*tp_basicsize*/
8023 0, /*tp_itemsize*/
8024 /* methods */
8025 encoding_map_dealloc, /*tp_dealloc*/
8026 0, /*tp_print*/
8027 0, /*tp_getattr*/
8028 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008029 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 0, /*tp_repr*/
8031 0, /*tp_as_number*/
8032 0, /*tp_as_sequence*/
8033 0, /*tp_as_mapping*/
8034 0, /*tp_hash*/
8035 0, /*tp_call*/
8036 0, /*tp_str*/
8037 0, /*tp_getattro*/
8038 0, /*tp_setattro*/
8039 0, /*tp_as_buffer*/
8040 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8041 0, /*tp_doc*/
8042 0, /*tp_traverse*/
8043 0, /*tp_clear*/
8044 0, /*tp_richcompare*/
8045 0, /*tp_weaklistoffset*/
8046 0, /*tp_iter*/
8047 0, /*tp_iternext*/
8048 encoding_map_methods, /*tp_methods*/
8049 0, /*tp_members*/
8050 0, /*tp_getset*/
8051 0, /*tp_base*/
8052 0, /*tp_dict*/
8053 0, /*tp_descr_get*/
8054 0, /*tp_descr_set*/
8055 0, /*tp_dictoffset*/
8056 0, /*tp_init*/
8057 0, /*tp_alloc*/
8058 0, /*tp_new*/
8059 0, /*tp_free*/
8060 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008061};
8062
8063PyObject*
8064PyUnicode_BuildEncodingMap(PyObject* string)
8065{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008066 PyObject *result;
8067 struct encoding_map *mresult;
8068 int i;
8069 int need_dict = 0;
8070 unsigned char level1[32];
8071 unsigned char level2[512];
8072 unsigned char *mlevel1, *mlevel2, *mlevel3;
8073 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008074 int kind;
8075 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008076 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008077 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008078
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008079 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008080 PyErr_BadArgument();
8081 return NULL;
8082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008083 kind = PyUnicode_KIND(string);
8084 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008085 length = PyUnicode_GET_LENGTH(string);
8086 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008087 memset(level1, 0xFF, sizeof level1);
8088 memset(level2, 0xFF, sizeof level2);
8089
8090 /* If there isn't a one-to-one mapping of NULL to \0,
8091 or if there are non-BMP characters, we need to use
8092 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008093 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008094 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008095 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008096 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008097 ch = PyUnicode_READ(kind, data, i);
8098 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008099 need_dict = 1;
8100 break;
8101 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008102 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008103 /* unmapped character */
8104 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008105 l1 = ch >> 11;
8106 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107 if (level1[l1] == 0xFF)
8108 level1[l1] = count2++;
8109 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008110 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008111 }
8112
8113 if (count2 >= 0xFF || count3 >= 0xFF)
8114 need_dict = 1;
8115
8116 if (need_dict) {
8117 PyObject *result = PyDict_New();
8118 PyObject *key, *value;
8119 if (!result)
8120 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008121 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008122 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008123 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124 if (!key || !value)
8125 goto failed1;
8126 if (PyDict_SetItem(result, key, value) == -1)
8127 goto failed1;
8128 Py_DECREF(key);
8129 Py_DECREF(value);
8130 }
8131 return result;
8132 failed1:
8133 Py_XDECREF(key);
8134 Py_XDECREF(value);
8135 Py_DECREF(result);
8136 return NULL;
8137 }
8138
8139 /* Create a three-level trie */
8140 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8141 16*count2 + 128*count3 - 1);
8142 if (!result)
8143 return PyErr_NoMemory();
8144 PyObject_Init(result, &EncodingMapType);
8145 mresult = (struct encoding_map*)result;
8146 mresult->count2 = count2;
8147 mresult->count3 = count3;
8148 mlevel1 = mresult->level1;
8149 mlevel2 = mresult->level23;
8150 mlevel3 = mresult->level23 + 16*count2;
8151 memcpy(mlevel1, level1, 32);
8152 memset(mlevel2, 0xFF, 16*count2);
8153 memset(mlevel3, 0, 128*count3);
8154 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008155 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008156 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008157 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8158 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 /* unmapped character */
8160 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008161 o1 = ch>>11;
8162 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163 i2 = 16*mlevel1[o1] + o2;
8164 if (mlevel2[i2] == 0xFF)
8165 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008166 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008167 i3 = 128*mlevel2[i2] + o3;
8168 mlevel3[i3] = i;
8169 }
8170 return result;
8171}
8172
8173static int
Victor Stinner22168992011-11-20 17:09:18 +01008174encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008175{
8176 struct encoding_map *map = (struct encoding_map*)mapping;
8177 int l1 = c>>11;
8178 int l2 = (c>>7) & 0xF;
8179 int l3 = c & 0x7F;
8180 int i;
8181
Victor Stinner22168992011-11-20 17:09:18 +01008182 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008184 if (c == 0)
8185 return 0;
8186 /* level 1*/
8187 i = map->level1[l1];
8188 if (i == 0xFF) {
8189 return -1;
8190 }
8191 /* level 2*/
8192 i = map->level23[16*i+l2];
8193 if (i == 0xFF) {
8194 return -1;
8195 }
8196 /* level 3 */
8197 i = map->level23[16*map->count2 + 128*i + l3];
8198 if (i == 0) {
8199 return -1;
8200 }
8201 return i;
8202}
8203
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008204/* Lookup the character ch in the mapping. If the character
8205 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008206 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008207static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008208charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209{
Christian Heimes217cfd12007-12-02 14:31:20 +00008210 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008211 PyObject *x;
8212
8213 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008215 x = PyObject_GetItem(mapping, w);
8216 Py_DECREF(w);
8217 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8219 /* No mapping found means: mapping is undefined. */
8220 PyErr_Clear();
8221 x = Py_None;
8222 Py_INCREF(x);
8223 return x;
8224 } else
8225 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008227 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008229 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 long value = PyLong_AS_LONG(x);
8231 if (value < 0 || value > 255) {
8232 PyErr_SetString(PyExc_TypeError,
8233 "character mapping must be in range(256)");
8234 Py_DECREF(x);
8235 return NULL;
8236 }
8237 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008239 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 /* wrong return value */
8243 PyErr_Format(PyExc_TypeError,
8244 "character mapping must return integer, bytes or None, not %.400s",
8245 x->ob_type->tp_name);
8246 Py_DECREF(x);
8247 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248 }
8249}
8250
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008251static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008252charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008253{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008254 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8255 /* exponentially overallocate to minimize reallocations */
8256 if (requiredsize < 2*outsize)
8257 requiredsize = 2*outsize;
8258 if (_PyBytes_Resize(outobj, requiredsize))
8259 return -1;
8260 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008261}
8262
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008266/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008267 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268 space is available. Return a new reference to the object that
8269 was put in the output buffer, or Py_None, if the mapping was undefined
8270 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008271 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008272static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008273charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008274 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008276 PyObject *rep;
8277 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008278 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008279
Christian Heimes90aa7642007-12-19 02:45:37 +00008280 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008281 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008283 if (res == -1)
8284 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 if (outsize<requiredsize)
8286 if (charmapencode_resize(outobj, outpos, requiredsize))
8287 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008288 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 outstart[(*outpos)++] = (char)res;
8290 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008291 }
8292
8293 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008296 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 Py_DECREF(rep);
8298 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008299 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 if (PyLong_Check(rep)) {
8301 Py_ssize_t requiredsize = *outpos+1;
8302 if (outsize<requiredsize)
8303 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8304 Py_DECREF(rep);
8305 return enc_EXCEPTION;
8306 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008307 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008309 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 else {
8311 const char *repchars = PyBytes_AS_STRING(rep);
8312 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8313 Py_ssize_t requiredsize = *outpos+repsize;
8314 if (outsize<requiredsize)
8315 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8316 Py_DECREF(rep);
8317 return enc_EXCEPTION;
8318 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008319 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 memcpy(outstart + *outpos, repchars, repsize);
8321 *outpos += repsize;
8322 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008323 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008324 Py_DECREF(rep);
8325 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008326}
8327
8328/* handle an error in PyUnicode_EncodeCharmap
8329 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008330static int
8331charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008332 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008334 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008335 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336{
8337 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008338 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008339 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008340 enum PyUnicode_Kind kind;
8341 void *data;
8342 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008343 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008344 Py_ssize_t collstartpos = *inpos;
8345 Py_ssize_t collendpos = *inpos+1;
8346 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008347 char *encoding = "charmap";
8348 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008349 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008350 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008351 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352
Benjamin Petersonbac79492012-01-14 13:34:47 -05008353 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008354 return -1;
8355 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008356 /* find all unencodable characters */
8357 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008358 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008359 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008360 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008361 val = encoding_map_lookup(ch, mapping);
8362 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 break;
8364 ++collendpos;
8365 continue;
8366 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008367
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008368 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8369 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 if (rep==NULL)
8371 return -1;
8372 else if (rep!=Py_None) {
8373 Py_DECREF(rep);
8374 break;
8375 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008376 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378 }
8379 /* cache callback name lookup
8380 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008381 if (*error_handler == _Py_ERROR_UNKNOWN)
8382 *error_handler = get_error_handler(errors);
8383
8384 switch (*error_handler) {
8385 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008386 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008387 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008388
8389 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008390 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 x = charmapencode_output('?', mapping, res, respos);
8392 if (x==enc_EXCEPTION) {
8393 return -1;
8394 }
8395 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008396 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 return -1;
8398 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008399 }
8400 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008401 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008402 *inpos = collendpos;
8403 break;
Victor Stinner50149202015-09-22 00:26:54 +02008404
8405 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008406 /* generate replacement (temporarily (mis)uses p) */
8407 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 char buffer[2+29+1+1];
8409 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008410 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 for (cp = buffer; *cp; ++cp) {
8412 x = charmapencode_output(*cp, mapping, res, respos);
8413 if (x==enc_EXCEPTION)
8414 return -1;
8415 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008416 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 return -1;
8418 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008419 }
8420 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008421 *inpos = collendpos;
8422 break;
Victor Stinner50149202015-09-22 00:26:54 +02008423
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 default:
Victor Stinner50149202015-09-22 00:26:54 +02008425 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008426 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008428 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008430 if (PyBytes_Check(repunicode)) {
8431 /* Directly copy bytes result to output. */
8432 Py_ssize_t outsize = PyBytes_Size(*res);
8433 Py_ssize_t requiredsize;
8434 repsize = PyBytes_Size(repunicode);
8435 requiredsize = *respos + repsize;
8436 if (requiredsize > outsize)
8437 /* Make room for all additional bytes. */
8438 if (charmapencode_resize(res, respos, requiredsize)) {
8439 Py_DECREF(repunicode);
8440 return -1;
8441 }
8442 memcpy(PyBytes_AsString(*res) + *respos,
8443 PyBytes_AsString(repunicode), repsize);
8444 *respos += repsize;
8445 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008446 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008447 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008448 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008449 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008450 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008451 Py_DECREF(repunicode);
8452 return -1;
8453 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008454 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008455 data = PyUnicode_DATA(repunicode);
8456 kind = PyUnicode_KIND(repunicode);
8457 for (index = 0; index < repsize; index++) {
8458 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8459 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008461 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 return -1;
8463 }
8464 else if (x==enc_FAILED) {
8465 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008466 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 return -1;
8468 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008469 }
8470 *inpos = newpos;
8471 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472 }
8473 return 0;
8474}
8475
Alexander Belopolsky40018472011-02-26 01:02:56 +00008476PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008477_PyUnicode_EncodeCharmap(PyObject *unicode,
8478 PyObject *mapping,
8479 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008481 /* output object */
8482 PyObject *res = NULL;
8483 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008484 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008485 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008487 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008488 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008490 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008491 void *data;
8492 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493
Benjamin Petersonbac79492012-01-14 13:34:47 -05008494 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008495 return NULL;
8496 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008497 data = PyUnicode_DATA(unicode);
8498 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008499
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 /* Default to Latin-1 */
8501 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008502 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008504 /* allocate enough for a simple encoding without
8505 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008506 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507 if (res == NULL)
8508 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008509 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008512 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008513 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008515 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 if (x==enc_EXCEPTION) /* error */
8517 goto onError;
8518 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008519 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008521 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 &res, &respos)) {
8523 goto onError;
8524 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008525 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 else
8527 /* done with this character => adjust input position */
8528 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008532 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008533 if (_PyBytes_Resize(&res, respos) < 0)
8534 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008535
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008536 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008537 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 return res;
8539
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 Py_XDECREF(res);
8542 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008543 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544 return NULL;
8545}
8546
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008547/* Deprecated */
8548PyObject *
8549PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8550 Py_ssize_t size,
8551 PyObject *mapping,
8552 const char *errors)
8553{
8554 PyObject *result;
8555 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8556 if (unicode == NULL)
8557 return NULL;
8558 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8559 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008560 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008561}
8562
Alexander Belopolsky40018472011-02-26 01:02:56 +00008563PyObject *
8564PyUnicode_AsCharmapString(PyObject *unicode,
8565 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566{
8567 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 PyErr_BadArgument();
8569 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008571 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572}
8573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008575static void
8576make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008577 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008578 Py_ssize_t startpos, Py_ssize_t endpos,
8579 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008581 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 *exceptionObject = _PyUnicodeTranslateError_Create(
8583 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 }
8585 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8587 goto onError;
8588 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8589 goto onError;
8590 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8591 goto onError;
8592 return;
8593 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008594 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595 }
8596}
8597
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598/* error handling callback helper:
8599 build arguments, call the callback and check the arguments,
8600 put the result into newpos and return the replacement string, which
8601 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008602static PyObject *
8603unicode_translate_call_errorhandler(const char *errors,
8604 PyObject **errorHandler,
8605 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008607 Py_ssize_t startpos, Py_ssize_t endpos,
8608 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008609{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02008610 static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008611
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008612 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008613 PyObject *restuple;
8614 PyObject *resunicode;
8615
8616 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 }
8621
8622 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008624 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626
8627 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008632 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 Py_DECREF(restuple);
8634 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635 }
8636 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 &resunicode, &i_newpos)) {
8638 Py_DECREF(restuple);
8639 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008640 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008641 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008643 else
8644 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008646 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 Py_DECREF(restuple);
8648 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008649 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008650 Py_INCREF(resunicode);
8651 Py_DECREF(restuple);
8652 return resunicode;
8653}
8654
8655/* Lookup the character ch in the mapping and put the result in result,
8656 which must be decrefed by the caller.
8657 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008660{
Christian Heimes217cfd12007-12-02 14:31:20 +00008661 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662 PyObject *x;
8663
8664 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 x = PyObject_GetItem(mapping, w);
8667 Py_DECREF(w);
8668 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8670 /* No mapping found means: use 1:1 mapping. */
8671 PyErr_Clear();
8672 *result = NULL;
8673 return 0;
8674 } else
8675 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 }
8677 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 *result = x;
8679 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008681 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008683 if (value < 0 || value > MAX_UNICODE) {
8684 PyErr_Format(PyExc_ValueError,
8685 "character mapping must be in range(0x%x)",
8686 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 Py_DECREF(x);
8688 return -1;
8689 }
8690 *result = x;
8691 return 0;
8692 }
8693 else if (PyUnicode_Check(x)) {
8694 *result = x;
8695 return 0;
8696 }
8697 else {
8698 /* wrong return value */
8699 PyErr_SetString(PyExc_TypeError,
8700 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008701 Py_DECREF(x);
8702 return -1;
8703 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008704}
Victor Stinner1194ea02014-04-04 19:37:40 +02008705
8706/* lookup the character, write the result into the writer.
8707 Return 1 if the result was written into the writer, return 0 if the mapping
8708 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008709static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008710charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8711 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712{
Victor Stinner1194ea02014-04-04 19:37:40 +02008713 PyObject *item;
8714
8715 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008717
8718 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008720 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008721 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008723 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008725
8726 if (item == Py_None) {
8727 Py_DECREF(item);
8728 return 0;
8729 }
8730
8731 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008732 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8733 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8734 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008735 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8736 Py_DECREF(item);
8737 return -1;
8738 }
8739 Py_DECREF(item);
8740 return 1;
8741 }
8742
8743 if (!PyUnicode_Check(item)) {
8744 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008746 }
8747
8748 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8749 Py_DECREF(item);
8750 return -1;
8751 }
8752
8753 Py_DECREF(item);
8754 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755}
8756
Victor Stinner89a76ab2014-04-05 11:44:04 +02008757static int
8758unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8759 Py_UCS1 *translate)
8760{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008761 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008762 int ret = 0;
8763
Victor Stinner89a76ab2014-04-05 11:44:04 +02008764 if (charmaptranslate_lookup(ch, mapping, &item)) {
8765 return -1;
8766 }
8767
8768 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008769 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008770 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008771 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008772 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008773 /* not found => default to 1:1 mapping */
8774 translate[ch] = ch;
8775 return 1;
8776 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008777 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008778 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008779 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8780 used it */
8781 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008782 /* invalid character or character outside ASCII:
8783 skip the fast translate */
8784 goto exit;
8785 }
8786 translate[ch] = (Py_UCS1)replace;
8787 }
8788 else if (PyUnicode_Check(item)) {
8789 Py_UCS4 replace;
8790
8791 if (PyUnicode_READY(item) == -1) {
8792 Py_DECREF(item);
8793 return -1;
8794 }
8795 if (PyUnicode_GET_LENGTH(item) != 1)
8796 goto exit;
8797
8798 replace = PyUnicode_READ_CHAR(item, 0);
8799 if (replace > 127)
8800 goto exit;
8801 translate[ch] = (Py_UCS1)replace;
8802 }
8803 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008804 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008805 goto exit;
8806 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008807 ret = 1;
8808
Benjamin Peterson1365de72014-04-07 20:15:41 -04008809 exit:
8810 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008811 return ret;
8812}
8813
8814/* Fast path for ascii => ascii translation. Return 1 if the whole string
8815 was translated into writer, return 0 if the input string was partially
8816 translated into writer, raise an exception and return -1 on error. */
8817static int
8818unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008819 _PyUnicodeWriter *writer, int ignore,
8820 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008821{
Victor Stinner872b2912014-04-05 14:27:07 +02008822 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008823 Py_ssize_t len;
8824 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008825 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008826
Victor Stinner89a76ab2014-04-05 11:44:04 +02008827 len = PyUnicode_GET_LENGTH(input);
8828
Victor Stinner872b2912014-04-05 14:27:07 +02008829 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008830
8831 in = PyUnicode_1BYTE_DATA(input);
8832 end = in + len;
8833
8834 assert(PyUnicode_IS_ASCII(writer->buffer));
8835 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8836 out = PyUnicode_1BYTE_DATA(writer->buffer);
8837
Victor Stinner872b2912014-04-05 14:27:07 +02008838 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008839 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008840 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008841 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008842 int translate = unicode_fast_translate_lookup(mapping, ch,
8843 ascii_table);
8844 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008845 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008846 if (translate == 0)
8847 goto exit;
8848 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008849 }
Victor Stinner872b2912014-04-05 14:27:07 +02008850 if (ch2 == 0xfe) {
8851 if (ignore)
8852 continue;
8853 goto exit;
8854 }
8855 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008856 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008857 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008858 }
Victor Stinner872b2912014-04-05 14:27:07 +02008859 res = 1;
8860
8861exit:
8862 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008863 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008864 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008865}
8866
Victor Stinner3222da22015-10-01 22:07:32 +02008867static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868_PyUnicode_TranslateCharmap(PyObject *input,
8869 PyObject *mapping,
8870 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008873 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 Py_ssize_t size, i;
8875 int kind;
8876 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008877 _PyUnicodeWriter writer;
8878 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008879 char *reason = "character maps to <undefined>";
8880 PyObject *errorHandler = NULL;
8881 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008882 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008883 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008884
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 PyErr_BadArgument();
8887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008890 if (PyUnicode_READY(input) == -1)
8891 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008892 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 kind = PyUnicode_KIND(input);
8894 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008896 if (size == 0)
8897 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008899 /* allocate enough for a simple 1:1 translation without
8900 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008901 _PyUnicodeWriter_Init(&writer);
8902 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904
Victor Stinner872b2912014-04-05 14:27:07 +02008905 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8906
Victor Stinner33798672016-03-01 21:59:58 +01008907 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008908 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008909 if (PyUnicode_IS_ASCII(input)) {
8910 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8911 if (res < 0) {
8912 _PyUnicodeWriter_Dealloc(&writer);
8913 return NULL;
8914 }
8915 if (res == 1)
8916 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008917 }
Victor Stinner33798672016-03-01 21:59:58 +01008918 else {
8919 i = 0;
8920 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008924 int translate;
8925 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8926 Py_ssize_t newpos;
8927 /* startpos for collecting untranslatable chars */
8928 Py_ssize_t collstart;
8929 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008930 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931
Victor Stinner1194ea02014-04-04 19:37:40 +02008932 ch = PyUnicode_READ(kind, data, i);
8933 translate = charmaptranslate_output(ch, mapping, &writer);
8934 if (translate < 0)
8935 goto onError;
8936
8937 if (translate != 0) {
8938 /* it worked => adjust input pointer */
8939 ++i;
8940 continue;
8941 }
8942
8943 /* untranslatable character */
8944 collstart = i;
8945 collend = i+1;
8946
8947 /* find all untranslatable characters */
8948 while (collend < size) {
8949 PyObject *x;
8950 ch = PyUnicode_READ(kind, data, collend);
8951 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008952 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008953 Py_XDECREF(x);
8954 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008956 ++collend;
8957 }
8958
8959 if (ignore) {
8960 i = collend;
8961 }
8962 else {
8963 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8964 reason, input, &exc,
8965 collstart, collend, &newpos);
8966 if (repunicode == NULL)
8967 goto onError;
8968 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008970 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008971 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008972 Py_DECREF(repunicode);
8973 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008974 }
8975 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008976 Py_XDECREF(exc);
8977 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008978 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008981 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008982 Py_XDECREF(exc);
8983 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984 return NULL;
8985}
8986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987/* Deprecated. Use PyUnicode_Translate instead. */
8988PyObject *
8989PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8990 Py_ssize_t size,
8991 PyObject *mapping,
8992 const char *errors)
8993{
Christian Heimes5f520f42012-09-11 14:03:25 +02008994 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8996 if (!unicode)
8997 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008998 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8999 Py_DECREF(unicode);
9000 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001}
9002
Alexander Belopolsky40018472011-02-26 01:02:56 +00009003PyObject *
9004PyUnicode_Translate(PyObject *str,
9005 PyObject *mapping,
9006 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009008 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009009 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009010 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011}
Tim Petersced69f82003-09-16 20:30:58 +00009012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009014fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009015{
9016 /* No need to call PyUnicode_READY(self) because this function is only
9017 called as a callback from fixup() which does it already. */
9018 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9019 const int kind = PyUnicode_KIND(self);
9020 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009021 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009022 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023 Py_ssize_t i;
9024
9025 for (i = 0; i < len; ++i) {
9026 ch = PyUnicode_READ(kind, data, i);
9027 fixed = 0;
9028 if (ch > 127) {
9029 if (Py_UNICODE_ISSPACE(ch))
9030 fixed = ' ';
9031 else {
9032 const int decimal = Py_UNICODE_TODECIMAL(ch);
9033 if (decimal >= 0)
9034 fixed = '0' + decimal;
9035 }
9036 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009037 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009038 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 PyUnicode_WRITE(kind, data, i, fixed);
9040 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009041 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009042 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 }
9045
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009046 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047}
9048
9049PyObject *
9050_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9051{
9052 if (!PyUnicode_Check(unicode)) {
9053 PyErr_BadInternalCall();
9054 return NULL;
9055 }
9056 if (PyUnicode_READY(unicode) == -1)
9057 return NULL;
9058 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9059 /* If the string is already ASCII, just return the same string */
9060 Py_INCREF(unicode);
9061 return unicode;
9062 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009063 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064}
9065
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009066PyObject *
9067PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9068 Py_ssize_t length)
9069{
Victor Stinnerf0124502011-11-21 23:12:56 +01009070 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009071 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009072 Py_UCS4 maxchar;
9073 enum PyUnicode_Kind kind;
9074 void *data;
9075
Victor Stinner99d7ad02012-02-22 13:37:39 +01009076 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009077 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009078 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009079 if (ch > 127) {
9080 int decimal = Py_UNICODE_TODECIMAL(ch);
9081 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009082 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009083 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009084 }
9085 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009086
9087 /* Copy to a new string */
9088 decimal = PyUnicode_New(length, maxchar);
9089 if (decimal == NULL)
9090 return decimal;
9091 kind = PyUnicode_KIND(decimal);
9092 data = PyUnicode_DATA(decimal);
9093 /* Iterate over code points */
9094 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009095 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009096 if (ch > 127) {
9097 int decimal = Py_UNICODE_TODECIMAL(ch);
9098 if (decimal >= 0)
9099 ch = '0' + decimal;
9100 }
9101 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009103 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009104}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009105/* --- Decimal Encoder ---------------------------------------------------- */
9106
Alexander Belopolsky40018472011-02-26 01:02:56 +00009107int
9108PyUnicode_EncodeDecimal(Py_UNICODE *s,
9109 Py_ssize_t length,
9110 char *output,
9111 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009112{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009113 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009114 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009115 enum PyUnicode_Kind kind;
9116 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009117
9118 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 PyErr_BadArgument();
9120 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009121 }
9122
Victor Stinner42bf7752011-11-21 22:52:58 +01009123 unicode = PyUnicode_FromUnicode(s, length);
9124 if (unicode == NULL)
9125 return -1;
9126
Benjamin Petersonbac79492012-01-14 13:34:47 -05009127 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009128 Py_DECREF(unicode);
9129 return -1;
9130 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009131 kind = PyUnicode_KIND(unicode);
9132 data = PyUnicode_DATA(unicode);
9133
Victor Stinnerb84d7232011-11-22 01:50:07 +01009134 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009135 PyObject *exc;
9136 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009138 Py_ssize_t startpos;
9139
9140 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009141
Benjamin Peterson29060642009-01-31 22:14:21 +00009142 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009143 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009144 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009145 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009146 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 decimal = Py_UNICODE_TODECIMAL(ch);
9148 if (decimal >= 0) {
9149 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009150 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 continue;
9152 }
9153 if (0 < ch && ch < 256) {
9154 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009155 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 continue;
9157 }
Victor Stinner6345be92011-11-25 20:09:01 +01009158
Victor Stinner42bf7752011-11-21 22:52:58 +01009159 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009160 exc = NULL;
9161 raise_encode_exception(&exc, "decimal", unicode,
9162 startpos, startpos+1,
9163 "invalid decimal Unicode string");
9164 Py_XDECREF(exc);
9165 Py_DECREF(unicode);
9166 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009167 }
9168 /* 0-terminate the output string */
9169 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009170 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009171 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009172}
9173
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174/* --- Helpers ------------------------------------------------------------ */
9175
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len` when too large; a negative `end` or `start` has
   `len` added to it and is then clamped to 0.  `start` is not clamped
   against `len`.

   `start` and `end` must be modifiable lvalues.  All three arguments are
   evaluated more than once, so they must be free of side effects.

   The body is wrapped in do { } while (0) so that an invocation followed by
   ';' is exactly one statement (safe inside an unbraced if/else), and every
   argument is parenthesized so compound expressions can be passed. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009192any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009194 Py_ssize_t end,
9195 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009197 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009198 void *buf1, *buf2;
9199 Py_ssize_t len1, len2, result;
9200
9201 kind1 = PyUnicode_KIND(s1);
9202 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009203 if (kind1 < kind2)
9204 return -1;
9205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 len1 = PyUnicode_GET_LENGTH(s1);
9207 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009208 ADJUST_INDICES(start, end, len1);
9209 if (end - start < len2)
9210 return -1;
9211
9212 buf1 = PyUnicode_DATA(s1);
9213 buf2 = PyUnicode_DATA(s2);
9214 if (len2 == 1) {
9215 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9216 result = findchar((const char *)buf1 + kind1*start,
9217 kind1, end - start, ch, direction);
9218 if (result == -1)
9219 return -1;
9220 else
9221 return start + result;
9222 }
9223
9224 if (kind2 != kind1) {
9225 buf2 = _PyUnicode_AsKind(s2, kind1);
9226 if (!buf2)
9227 return -2;
9228 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229
Victor Stinner794d5672011-10-10 03:21:36 +02009230 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009231 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009232 case PyUnicode_1BYTE_KIND:
9233 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9234 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9235 else
9236 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9237 break;
9238 case PyUnicode_2BYTE_KIND:
9239 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9240 break;
9241 case PyUnicode_4BYTE_KIND:
9242 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9243 break;
9244 default:
9245 assert(0); result = -2;
9246 }
9247 }
9248 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009249 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009250 case PyUnicode_1BYTE_KIND:
9251 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9252 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9253 else
9254 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9255 break;
9256 case PyUnicode_2BYTE_KIND:
9257 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9258 break;
9259 case PyUnicode_4BYTE_KIND:
9260 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9261 break;
9262 default:
9263 assert(0); result = -2;
9264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 }
9266
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009267 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268 PyMem_Free(buf2);
9269
9270 return result;
9271}
9272
9273Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009274_PyUnicode_InsertThousandsGrouping(
9275 PyObject *unicode, Py_ssize_t index,
9276 Py_ssize_t n_buffer,
9277 void *digits, Py_ssize_t n_digits,
9278 Py_ssize_t min_width,
9279 const char *grouping, PyObject *thousands_sep,
9280 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281{
Victor Stinner41a863c2012-02-24 00:37:51 +01009282 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009283 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009284 Py_ssize_t thousands_sep_len;
9285 Py_ssize_t len;
9286
9287 if (unicode != NULL) {
9288 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009289 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009290 }
9291 else {
9292 kind = PyUnicode_1BYTE_KIND;
9293 data = NULL;
9294 }
9295 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9296 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9297 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9298 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009299 if (thousands_sep_kind < kind) {
9300 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9301 if (!thousands_sep_data)
9302 return -1;
9303 }
9304 else {
9305 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9306 if (!data)
9307 return -1;
9308 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009309 }
9310
Benjamin Petersonead6b532011-12-20 17:23:42 -06009311 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009313 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009314 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009315 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009316 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009317 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009318 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009319 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009320 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009321 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009322 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009323 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009325 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009326 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009327 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009328 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009329 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009331 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009332 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009333 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009334 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009335 break;
9336 default:
9337 assert(0);
9338 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009340 if (unicode != NULL && thousands_sep_kind != kind) {
9341 if (thousands_sep_kind < kind)
9342 PyMem_Free(thousands_sep_data);
9343 else
9344 PyMem_Free(data);
9345 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009346 if (unicode == NULL) {
9347 *maxchar = 127;
9348 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009349 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009350 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009351 }
9352 }
9353 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354}
9355
9356
Alexander Belopolsky40018472011-02-26 01:02:56 +00009357Py_ssize_t
9358PyUnicode_Count(PyObject *str,
9359 PyObject *substr,
9360 Py_ssize_t start,
9361 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009363 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009364 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 void *buf1 = NULL, *buf2 = NULL;
9366 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009367
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009368 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009369 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009370
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009371 kind1 = PyUnicode_KIND(str);
9372 kind2 = PyUnicode_KIND(substr);
9373 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009374 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009375
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009376 len1 = PyUnicode_GET_LENGTH(str);
9377 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009379 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009380 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009381
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009382 buf1 = PyUnicode_DATA(str);
9383 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009384 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009385 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009386 if (!buf2)
9387 goto onError;
9388 }
9389
9390 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009392 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009393 result = asciilib_count(
9394 ((Py_UCS1*)buf1) + start, end - start,
9395 buf2, len2, PY_SSIZE_T_MAX
9396 );
9397 else
9398 result = ucs1lib_count(
9399 ((Py_UCS1*)buf1) + start, end - start,
9400 buf2, len2, PY_SSIZE_T_MAX
9401 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 break;
9403 case PyUnicode_2BYTE_KIND:
9404 result = ucs2lib_count(
9405 ((Py_UCS2*)buf1) + start, end - start,
9406 buf2, len2, PY_SSIZE_T_MAX
9407 );
9408 break;
9409 case PyUnicode_4BYTE_KIND:
9410 result = ucs4lib_count(
9411 ((Py_UCS4*)buf1) + start, end - start,
9412 buf2, len2, PY_SSIZE_T_MAX
9413 );
9414 break;
9415 default:
9416 assert(0); result = 0;
9417 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009418
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009419 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 PyMem_Free(buf2);
9421
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009424 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 PyMem_Free(buf2);
9426 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427}
9428
Alexander Belopolsky40018472011-02-26 01:02:56 +00009429Py_ssize_t
9430PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009431 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009432 Py_ssize_t start,
9433 Py_ssize_t end,
9434 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009436 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009437 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009438
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009439 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440}
9441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442Py_ssize_t
9443PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9444 Py_ssize_t start, Py_ssize_t end,
9445 int direction)
9446{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009448 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 if (PyUnicode_READY(str) == -1)
9450 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009451 if (start < 0 || end < 0) {
9452 PyErr_SetString(PyExc_IndexError, "string index out of range");
9453 return -2;
9454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 if (end > PyUnicode_GET_LENGTH(str))
9456 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009457 if (start >= end)
9458 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009460 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9461 kind, end-start, ch, direction);
9462 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009464 else
9465 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466}
9467
Alexander Belopolsky40018472011-02-26 01:02:56 +00009468static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009469tailmatch(PyObject *self,
9470 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009471 Py_ssize_t start,
9472 Py_ssize_t end,
9473 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 int kind_self;
9476 int kind_sub;
9477 void *data_self;
9478 void *data_sub;
9479 Py_ssize_t offset;
9480 Py_ssize_t i;
9481 Py_ssize_t end_sub;
9482
9483 if (PyUnicode_READY(self) == -1 ||
9484 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009485 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9488 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009490 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009492 if (PyUnicode_GET_LENGTH(substring) == 0)
9493 return 1;
9494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 kind_self = PyUnicode_KIND(self);
9496 data_self = PyUnicode_DATA(self);
9497 kind_sub = PyUnicode_KIND(substring);
9498 data_sub = PyUnicode_DATA(substring);
9499 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9500
9501 if (direction > 0)
9502 offset = end;
9503 else
9504 offset = start;
9505
9506 if (PyUnicode_READ(kind_self, data_self, offset) ==
9507 PyUnicode_READ(kind_sub, data_sub, 0) &&
9508 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9509 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9510 /* If both are of the same kind, memcmp is sufficient */
9511 if (kind_self == kind_sub) {
9512 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009513 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 data_sub,
9515 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009516 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009518 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519 else {
9520 /* We do not need to compare 0 and len(substring)-1 because
9521 the if statement above ensured already that they are equal
9522 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 for (i = 1; i < end_sub; ++i) {
9524 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9525 PyUnicode_READ(kind_sub, data_sub, i))
9526 return 0;
9527 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009528 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 }
9531
9532 return 0;
9533}
9534
Alexander Belopolsky40018472011-02-26 01:02:56 +00009535Py_ssize_t
9536PyUnicode_Tailmatch(PyObject *str,
9537 PyObject *substr,
9538 Py_ssize_t start,
9539 Py_ssize_t end,
9540 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009542 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009543 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009544
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009545 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546}
9547
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548/* Apply fixfct filter to the Unicode object self and return a
9549 reference to the modified object */
9550
Alexander Belopolsky40018472011-02-26 01:02:56 +00009551static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009552fixup(PyObject *self,
9553 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009554{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 PyObject *u;
9556 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009557 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009559 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009560 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009561 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009562 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 /* fix functions return the new maximum character in a string,
9565 if the kind of the resulting unicode object does not change,
9566 everything is fine. Otherwise we need to change the string kind
9567 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009568 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009569
9570 if (maxchar_new == 0) {
9571 /* no changes */;
9572 if (PyUnicode_CheckExact(self)) {
9573 Py_DECREF(u);
9574 Py_INCREF(self);
9575 return self;
9576 }
9577 else
9578 return u;
9579 }
9580
Victor Stinnere6abb482012-05-02 01:15:40 +02009581 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582
Victor Stinnereaab6042011-12-11 22:22:39 +01009583 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009585
9586 /* In case the maximum character changed, we need to
9587 convert the string to the new category. */
9588 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9589 if (v == NULL) {
9590 Py_DECREF(u);
9591 return NULL;
9592 }
9593 if (maxchar_new > maxchar_old) {
9594 /* If the maxchar increased so that the kind changed, not all
9595 characters are representable anymore and we need to fix the
9596 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009597 _PyUnicode_FastCopyCharacters(v, 0,
9598 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009599 maxchar_old = fixfct(v);
9600 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 }
9602 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009603 _PyUnicode_FastCopyCharacters(v, 0,
9604 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009606 Py_DECREF(u);
9607 assert(_PyUnicode_CheckConsistency(v, 1));
9608 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609}
9610
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009611static PyObject *
9612ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009614 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9615 char *resdata, *data = PyUnicode_DATA(self);
9616 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009617
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009618 res = PyUnicode_New(len, 127);
9619 if (res == NULL)
9620 return NULL;
9621 resdata = PyUnicode_DATA(res);
9622 if (lower)
9623 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009625 _Py_bytes_upper(resdata, data, len);
9626 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627}
9628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009630handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009632 Py_ssize_t j;
9633 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009634 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009635 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009636
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009637 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9638
9639 where ! is a negation and \p{xxx} is a character with property xxx.
9640 */
9641 for (j = i - 1; j >= 0; j--) {
9642 c = PyUnicode_READ(kind, data, j);
9643 if (!_PyUnicode_IsCaseIgnorable(c))
9644 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009646 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9647 if (final_sigma) {
9648 for (j = i + 1; j < length; j++) {
9649 c = PyUnicode_READ(kind, data, j);
9650 if (!_PyUnicode_IsCaseIgnorable(c))
9651 break;
9652 }
9653 final_sigma = j == length || !_PyUnicode_IsCased(c);
9654 }
9655 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656}
9657
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009658static int
9659lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9660 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 /* Obscure special case. */
9663 if (c == 0x3A3) {
9664 mapped[0] = handle_capital_sigma(kind, data, length, i);
9665 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009667 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668}
9669
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009670static Py_ssize_t
9671do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673 Py_ssize_t i, k = 0;
9674 int n_res, j;
9675 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009676
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009677 c = PyUnicode_READ(kind, data, 0);
9678 n_res = _PyUnicode_ToUpperFull(c, mapped);
9679 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009680 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009681 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 for (i = 1; i < length; i++) {
9684 c = PyUnicode_READ(kind, data, i);
9685 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9686 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009687 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009689 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009690 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009691 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692}
9693
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694static Py_ssize_t
9695do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9696 Py_ssize_t i, k = 0;
9697
9698 for (i = 0; i < length; i++) {
9699 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9700 int n_res, j;
9701 if (Py_UNICODE_ISUPPER(c)) {
9702 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9703 }
9704 else if (Py_UNICODE_ISLOWER(c)) {
9705 n_res = _PyUnicode_ToUpperFull(c, mapped);
9706 }
9707 else {
9708 n_res = 1;
9709 mapped[0] = c;
9710 }
9711 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009712 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009713 res[k++] = mapped[j];
9714 }
9715 }
9716 return k;
9717}
9718
9719static Py_ssize_t
9720do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9721 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009723 Py_ssize_t i, k = 0;
9724
9725 for (i = 0; i < length; i++) {
9726 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9727 int n_res, j;
9728 if (lower)
9729 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9730 else
9731 n_res = _PyUnicode_ToUpperFull(c, mapped);
9732 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009733 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009734 res[k++] = mapped[j];
9735 }
9736 }
9737 return k;
9738}
9739
9740static Py_ssize_t
9741do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9742{
9743 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9744}
9745
9746static Py_ssize_t
9747do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9748{
9749 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9750}
9751
Benjamin Petersone51757f2012-01-12 21:10:29 -05009752static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009753do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9754{
9755 Py_ssize_t i, k = 0;
9756
9757 for (i = 0; i < length; i++) {
9758 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9759 Py_UCS4 mapped[3];
9760 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9761 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009762 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009763 res[k++] = mapped[j];
9764 }
9765 }
9766 return k;
9767}
9768
9769static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009770do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9771{
9772 Py_ssize_t i, k = 0;
9773 int previous_is_cased;
9774
9775 previous_is_cased = 0;
9776 for (i = 0; i < length; i++) {
9777 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9778 Py_UCS4 mapped[3];
9779 int n_res, j;
9780
9781 if (previous_is_cased)
9782 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9783 else
9784 n_res = _PyUnicode_ToTitleFull(c, mapped);
9785
9786 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009787 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009788 res[k++] = mapped[j];
9789 }
9790
9791 previous_is_cased = _PyUnicode_IsCased(c);
9792 }
9793 return k;
9794}
9795
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009796static PyObject *
9797case_operation(PyObject *self,
9798 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9799{
9800 PyObject *res = NULL;
9801 Py_ssize_t length, newlength = 0;
9802 int kind, outkind;
9803 void *data, *outdata;
9804 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9805
Benjamin Petersoneea48462012-01-16 14:28:50 -05009806 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009807
9808 kind = PyUnicode_KIND(self);
9809 data = PyUnicode_DATA(self);
9810 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009811 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009812 PyErr_SetString(PyExc_OverflowError, "string is too long");
9813 return NULL;
9814 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009815 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009816 if (tmp == NULL)
9817 return PyErr_NoMemory();
9818 newlength = perform(kind, data, length, tmp, &maxchar);
9819 res = PyUnicode_New(newlength, maxchar);
9820 if (res == NULL)
9821 goto leave;
9822 tmpend = tmp + newlength;
9823 outdata = PyUnicode_DATA(res);
9824 outkind = PyUnicode_KIND(res);
9825 switch (outkind) {
9826 case PyUnicode_1BYTE_KIND:
9827 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9828 break;
9829 case PyUnicode_2BYTE_KIND:
9830 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9831 break;
9832 case PyUnicode_4BYTE_KIND:
9833 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9834 break;
9835 default:
9836 assert(0);
9837 break;
9838 }
9839 leave:
9840 PyMem_FREE(tmp);
9841 return res;
9842}
9843
Tim Peters8ce9f162004-08-27 01:49:32 +00009844PyObject *
9845PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009848 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009850 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009851 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9852 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009853 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009855 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009857 int use_memcpy;
9858 unsigned char *res_data = NULL, *sep_data = NULL;
9859 PyObject *last_obj;
9860 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009862 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009863 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009864 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009865 }
9866
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009867 /* NOTE: the following code can't call back into Python code,
9868 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009869 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009870
Tim Peters05eba1f2004-08-27 21:32:02 +00009871 seqlen = PySequence_Fast_GET_SIZE(fseq);
9872 /* If empty sequence, return u"". */
9873 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009874 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009875 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009876 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009877
Tim Peters05eba1f2004-08-27 21:32:02 +00009878 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009879 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009880 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009881 if (seqlen == 1) {
9882 if (PyUnicode_CheckExact(items[0])) {
9883 res = items[0];
9884 Py_INCREF(res);
9885 Py_DECREF(fseq);
9886 return res;
9887 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009888 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009889 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009890 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009891 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009892 /* Set up sep and seplen */
9893 if (separator == NULL) {
9894 /* fall back to a blank space separator */
9895 sep = PyUnicode_FromOrdinal(' ');
9896 if (!sep)
9897 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009898 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009899 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009900 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009901 else {
9902 if (!PyUnicode_Check(separator)) {
9903 PyErr_Format(PyExc_TypeError,
9904 "separator: expected str instance,"
9905 " %.80s found",
9906 Py_TYPE(separator)->tp_name);
9907 goto onError;
9908 }
9909 if (PyUnicode_READY(separator))
9910 goto onError;
9911 sep = separator;
9912 seplen = PyUnicode_GET_LENGTH(separator);
9913 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9914 /* inc refcount to keep this code path symmetric with the
9915 above case of a blank separator */
9916 Py_INCREF(sep);
9917 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009918 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009919 }
9920
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009921 /* There are at least two things to join, or else we have a subclass
9922 * of str in the sequence.
9923 * Do a pre-pass to figure out the total amount of space we'll
9924 * need (sz), and see whether all argument are strings.
9925 */
9926 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009927#ifdef Py_DEBUG
9928 use_memcpy = 0;
9929#else
9930 use_memcpy = 1;
9931#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009932 for (i = 0; i < seqlen; i++) {
9933 const Py_ssize_t old_sz = sz;
9934 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009935 if (!PyUnicode_Check(item)) {
9936 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009937 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009938 " %.80s found",
9939 i, Py_TYPE(item)->tp_name);
9940 goto onError;
9941 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 if (PyUnicode_READY(item) == -1)
9943 goto onError;
9944 sz += PyUnicode_GET_LENGTH(item);
9945 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009946 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009947 if (i != 0)
9948 sz += seplen;
9949 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9950 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009951 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009952 goto onError;
9953 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009954 if (use_memcpy && last_obj != NULL) {
9955 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9956 use_memcpy = 0;
9957 }
9958 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009959 }
Tim Petersced69f82003-09-16 20:30:58 +00009960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009962 if (res == NULL)
9963 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009964
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009965 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009966#ifdef Py_DEBUG
9967 use_memcpy = 0;
9968#else
9969 if (use_memcpy) {
9970 res_data = PyUnicode_1BYTE_DATA(res);
9971 kind = PyUnicode_KIND(res);
9972 if (seplen != 0)
9973 sep_data = PyUnicode_1BYTE_DATA(sep);
9974 }
9975#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009976 if (use_memcpy) {
9977 for (i = 0; i < seqlen; ++i) {
9978 Py_ssize_t itemlen;
9979 item = items[i];
9980
9981 /* Copy item, and maybe the separator. */
9982 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009983 Py_MEMCPY(res_data,
9984 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009985 kind * seplen);
9986 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009987 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009988
9989 itemlen = PyUnicode_GET_LENGTH(item);
9990 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009991 Py_MEMCPY(res_data,
9992 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009993 kind * itemlen);
9994 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009995 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009996 }
9997 assert(res_data == PyUnicode_1BYTE_DATA(res)
9998 + kind * PyUnicode_GET_LENGTH(res));
9999 }
10000 else {
10001 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10002 Py_ssize_t itemlen;
10003 item = items[i];
10004
10005 /* Copy item, and maybe the separator. */
10006 if (i && seplen != 0) {
10007 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10008 res_offset += seplen;
10009 }
10010
10011 itemlen = PyUnicode_GET_LENGTH(item);
10012 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010013 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010014 res_offset += itemlen;
10015 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010016 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010017 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010018 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010019
Tim Peters05eba1f2004-08-27 21:32:02 +000010020 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010022 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024
Benjamin Peterson29060642009-01-31 22:14:21 +000010025 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010026 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010028 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029 return NULL;
10030}
10031
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code points of the buffer
   `data`, beginning at code point index `start`.  `kind` selects the
   element width (PyUnicode_1BYTE_KIND / 2BYTE / 4BYTE); the wchar kind is
   rejected by an assertion, as is any unknown kind.

   For the 1-byte kind the value is converted to unsigned char and written
   with memset(); the wider kinds use a plain store loop.

   Every macro argument is parenthesized in the expansion so that
   expression arguments group as intended.  Some arguments (`kind`,
   `value`, `length`) may still be evaluated more than once, so pass
   expressions without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10055
Victor Stinnerd3f08822012-05-29 12:57:52 +020010056void
10057_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10058 Py_UCS4 fill_char)
10059{
10060 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10061 const void *data = PyUnicode_DATA(unicode);
10062 assert(PyUnicode_IS_READY(unicode));
10063 assert(unicode_modifiable(unicode));
10064 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10065 assert(start >= 0);
10066 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10067 FILL(kind, data, fill_char, start, length);
10068}
10069
Victor Stinner3fe55312012-01-04 00:33:50 +010010070Py_ssize_t
10071PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10072 Py_UCS4 fill_char)
10073{
10074 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010075
10076 if (!PyUnicode_Check(unicode)) {
10077 PyErr_BadInternalCall();
10078 return -1;
10079 }
10080 if (PyUnicode_READY(unicode) == -1)
10081 return -1;
10082 if (unicode_check_modifiable(unicode))
10083 return -1;
10084
Victor Stinnerd3f08822012-05-29 12:57:52 +020010085 if (start < 0) {
10086 PyErr_SetString(PyExc_IndexError, "string index out of range");
10087 return -1;
10088 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010089 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10090 PyErr_SetString(PyExc_ValueError,
10091 "fill character is bigger than "
10092 "the string maximum character");
10093 return -1;
10094 }
10095
10096 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10097 length = Py_MIN(maxlen, length);
10098 if (length <= 0)
10099 return 0;
10100
Victor Stinnerd3f08822012-05-29 12:57:52 +020010101 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010102 return length;
10103}
10104
Victor Stinner9310abb2011-10-05 00:59:23 +020010105static PyObject *
10106pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010107 Py_ssize_t left,
10108 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 PyObject *u;
10112 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010113 int kind;
10114 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115
10116 if (left < 0)
10117 left = 0;
10118 if (right < 0)
10119 right = 0;
10120
Victor Stinnerc4b49542011-12-11 22:44:26 +010010121 if (left == 0 && right == 0)
10122 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10125 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010126 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10127 return NULL;
10128 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010130 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010132 if (!u)
10133 return NULL;
10134
10135 kind = PyUnicode_KIND(u);
10136 data = PyUnicode_DATA(u);
10137 if (left)
10138 FILL(kind, data, fill, 0, left);
10139 if (right)
10140 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010141 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010142 assert(_PyUnicode_CheckConsistency(u, 1));
10143 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144}
10145
Alexander Belopolsky40018472011-02-26 01:02:56 +000010146PyObject *
10147PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010151 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153
Benjamin Petersonead6b532011-12-20 17:23:42 -060010154 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010156 if (PyUnicode_IS_ASCII(string))
10157 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010158 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010159 PyUnicode_GET_LENGTH(string), keepends);
10160 else
10161 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010162 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010163 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 break;
10165 case PyUnicode_2BYTE_KIND:
10166 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010167 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 PyUnicode_GET_LENGTH(string), keepends);
10169 break;
10170 case PyUnicode_4BYTE_KIND:
10171 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010172 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 PyUnicode_GET_LENGTH(string), keepends);
10174 break;
10175 default:
10176 assert(0);
10177 list = 0;
10178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180}
10181
Alexander Belopolsky40018472011-02-26 01:02:56 +000010182static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010183split(PyObject *self,
10184 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010185 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010187 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 void *buf1, *buf2;
10189 Py_ssize_t len1, len2;
10190 PyObject* out;
10191
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010193 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 if (PyUnicode_READY(self) == -1)
10196 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010199 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010201 if (PyUnicode_IS_ASCII(self))
10202 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010203 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010204 PyUnicode_GET_LENGTH(self), maxcount
10205 );
10206 else
10207 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010208 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010209 PyUnicode_GET_LENGTH(self), maxcount
10210 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 case PyUnicode_2BYTE_KIND:
10212 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010213 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 PyUnicode_GET_LENGTH(self), maxcount
10215 );
10216 case PyUnicode_4BYTE_KIND:
10217 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010218 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 PyUnicode_GET_LENGTH(self), maxcount
10220 );
10221 default:
10222 assert(0);
10223 return NULL;
10224 }
10225
10226 if (PyUnicode_READY(substring) == -1)
10227 return NULL;
10228
10229 kind1 = PyUnicode_KIND(self);
10230 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 len1 = PyUnicode_GET_LENGTH(self);
10232 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010233 if (kind1 < kind2 || len1 < len2) {
10234 out = PyList_New(1);
10235 if (out == NULL)
10236 return NULL;
10237 Py_INCREF(self);
10238 PyList_SET_ITEM(out, 0, self);
10239 return out;
10240 }
10241 buf1 = PyUnicode_DATA(self);
10242 buf2 = PyUnicode_DATA(substring);
10243 if (kind2 != kind1) {
10244 buf2 = _PyUnicode_AsKind(substring, kind1);
10245 if (!buf2)
10246 return NULL;
10247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010249 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010251 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10252 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010253 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010254 else
10255 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010256 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 break;
10258 case PyUnicode_2BYTE_KIND:
10259 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010260 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 break;
10262 case PyUnicode_4BYTE_KIND:
10263 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010264 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 break;
10266 default:
10267 out = NULL;
10268 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010269 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 PyMem_Free(buf2);
10271 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272}
10273
Alexander Belopolsky40018472011-02-26 01:02:56 +000010274static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010275rsplit(PyObject *self,
10276 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010277 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010278{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010279 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 void *buf1, *buf2;
10281 Py_ssize_t len1, len2;
10282 PyObject* out;
10283
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010284 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010285 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 if (PyUnicode_READY(self) == -1)
10288 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010291 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010293 if (PyUnicode_IS_ASCII(self))
10294 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010295 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010296 PyUnicode_GET_LENGTH(self), maxcount
10297 );
10298 else
10299 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010300 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010301 PyUnicode_GET_LENGTH(self), maxcount
10302 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 case PyUnicode_2BYTE_KIND:
10304 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010305 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 PyUnicode_GET_LENGTH(self), maxcount
10307 );
10308 case PyUnicode_4BYTE_KIND:
10309 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010310 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 PyUnicode_GET_LENGTH(self), maxcount
10312 );
10313 default:
10314 assert(0);
10315 return NULL;
10316 }
10317
10318 if (PyUnicode_READY(substring) == -1)
10319 return NULL;
10320
10321 kind1 = PyUnicode_KIND(self);
10322 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 len1 = PyUnicode_GET_LENGTH(self);
10324 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010325 if (kind1 < kind2 || len1 < len2) {
10326 out = PyList_New(1);
10327 if (out == NULL)
10328 return NULL;
10329 Py_INCREF(self);
10330 PyList_SET_ITEM(out, 0, self);
10331 return out;
10332 }
10333 buf1 = PyUnicode_DATA(self);
10334 buf2 = PyUnicode_DATA(substring);
10335 if (kind2 != kind1) {
10336 buf2 = _PyUnicode_AsKind(substring, kind1);
10337 if (!buf2)
10338 return NULL;
10339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010341 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010343 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10344 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010345 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010346 else
10347 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010348 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 break;
10350 case PyUnicode_2BYTE_KIND:
10351 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010352 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 break;
10354 case PyUnicode_4BYTE_KIND:
10355 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010356 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 break;
10358 default:
10359 out = NULL;
10360 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010361 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 PyMem_Free(buf2);
10363 return out;
10364}
10365
10366static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010367anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10368 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010370 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010372 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10373 return asciilib_find(buf1, len1, buf2, len2, offset);
10374 else
10375 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 case PyUnicode_2BYTE_KIND:
10377 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10378 case PyUnicode_4BYTE_KIND:
10379 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10380 }
10381 assert(0);
10382 return -1;
10383}
10384
10385static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010386anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10387 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010389 switch (kind) {
10390 case PyUnicode_1BYTE_KIND:
10391 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10392 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10393 else
10394 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10395 case PyUnicode_2BYTE_KIND:
10396 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10397 case PyUnicode_4BYTE_KIND:
10398 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10399 }
10400 assert(0);
10401 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010402}
10403
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010404static void
10405replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10406 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10407{
10408 int kind = PyUnicode_KIND(u);
10409 void *data = PyUnicode_DATA(u);
10410 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10411 if (kind == PyUnicode_1BYTE_KIND) {
10412 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10413 (Py_UCS1 *)data + len,
10414 u1, u2, maxcount);
10415 }
10416 else if (kind == PyUnicode_2BYTE_KIND) {
10417 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10418 (Py_UCS2 *)data + len,
10419 u1, u2, maxcount);
10420 }
10421 else {
10422 assert(kind == PyUnicode_4BYTE_KIND);
10423 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10424 (Py_UCS4 *)data + len,
10425 u1, u2, maxcount);
10426 }
10427}
10428
Alexander Belopolsky40018472011-02-26 01:02:56 +000010429static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430replace(PyObject *self, PyObject *str1,
10431 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 PyObject *u;
10434 char *sbuf = PyUnicode_DATA(self);
10435 char *buf1 = PyUnicode_DATA(str1);
10436 char *buf2 = PyUnicode_DATA(str2);
10437 int srelease = 0, release1 = 0, release2 = 0;
10438 int skind = PyUnicode_KIND(self);
10439 int kind1 = PyUnicode_KIND(str1);
10440 int kind2 = PyUnicode_KIND(str2);
10441 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10442 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10443 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010444 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010445 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010446
10447 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010448 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010450 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010451
Victor Stinner59de0ee2011-10-07 10:01:28 +020010452 if (str1 == str2)
10453 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454
Victor Stinner49a0a212011-10-12 23:46:10 +020010455 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010456 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10457 if (maxchar < maxchar_str1)
10458 /* substring too wide to be present */
10459 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010460 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10461 /* Replacing str1 with str2 may cause a maxchar reduction in the
10462 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010463 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010464 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010467 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010469 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010471 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010472 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010473 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010474
Victor Stinner69ed0f42013-04-09 21:48:24 +020010475 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010476 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010477 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010478 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010479 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010481 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010483
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010484 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10485 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010486 }
10487 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 int rkind = skind;
10489 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010490 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 if (kind1 < rkind) {
10493 /* widen substring */
10494 buf1 = _PyUnicode_AsKind(str1, rkind);
10495 if (!buf1) goto error;
10496 release1 = 1;
10497 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010498 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010499 if (i < 0)
10500 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 if (rkind > kind2) {
10502 /* widen replacement */
10503 buf2 = _PyUnicode_AsKind(str2, rkind);
10504 if (!buf2) goto error;
10505 release2 = 1;
10506 }
10507 else if (rkind < kind2) {
10508 /* widen self and buf1 */
10509 rkind = kind2;
10510 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010511 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 sbuf = _PyUnicode_AsKind(self, rkind);
10513 if (!sbuf) goto error;
10514 srelease = 1;
10515 buf1 = _PyUnicode_AsKind(str1, rkind);
10516 if (!buf1) goto error;
10517 release1 = 1;
10518 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010519 u = PyUnicode_New(slen, maxchar);
10520 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010522 assert(PyUnicode_KIND(u) == rkind);
10523 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010524
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010525 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010526 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010527 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010529 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010531
10532 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010533 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010534 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010535 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010536 if (i == -1)
10537 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010538 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010540 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010542 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010544 }
10545 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010547 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 int rkind = skind;
10549 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010552 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 buf1 = _PyUnicode_AsKind(str1, rkind);
10554 if (!buf1) goto error;
10555 release1 = 1;
10556 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010557 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010558 if (n == 0)
10559 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010561 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 buf2 = _PyUnicode_AsKind(str2, rkind);
10563 if (!buf2) goto error;
10564 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010567 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 rkind = kind2;
10569 sbuf = _PyUnicode_AsKind(self, rkind);
10570 if (!sbuf) goto error;
10571 srelease = 1;
10572 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010573 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 buf1 = _PyUnicode_AsKind(str1, rkind);
10575 if (!buf1) goto error;
10576 release1 = 1;
10577 }
10578 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10579 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010580 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 PyErr_SetString(PyExc_OverflowError,
10582 "replace string is too long");
10583 goto error;
10584 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010585 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010586 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010587 _Py_INCREF_UNICODE_EMPTY();
10588 if (!unicode_empty)
10589 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010590 u = unicode_empty;
10591 goto done;
10592 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010593 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 PyErr_SetString(PyExc_OverflowError,
10595 "replace string is too long");
10596 goto error;
10597 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010598 u = PyUnicode_New(new_size, maxchar);
10599 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010601 assert(PyUnicode_KIND(u) == rkind);
10602 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 ires = i = 0;
10604 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010605 while (n-- > 0) {
10606 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010607 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010608 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010609 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010610 if (j == -1)
10611 break;
10612 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010613 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010614 memcpy(res + rkind * ires,
10615 sbuf + rkind * i,
10616 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010618 }
10619 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010621 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010623 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010630 memcpy(res + rkind * ires,
10631 sbuf + rkind * i,
10632 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010633 }
10634 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010635 /* interleave */
10636 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010637 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010639 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010641 if (--n <= 0)
10642 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010643 memcpy(res + rkind * ires,
10644 sbuf + rkind * i,
10645 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 ires++;
10647 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010648 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010649 memcpy(res + rkind * ires,
10650 sbuf + rkind * i,
10651 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010653 }
10654
10655 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010656 unicode_adjust_maxchar(&u);
10657 if (u == NULL)
10658 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010660
10661 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 if (srelease)
10663 PyMem_FREE(sbuf);
10664 if (release1)
10665 PyMem_FREE(buf1);
10666 if (release2)
10667 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010668 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670
Benjamin Peterson29060642009-01-31 22:14:21 +000010671 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010672 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 if (srelease)
10674 PyMem_FREE(sbuf);
10675 if (release1)
10676 PyMem_FREE(buf1);
10677 if (release2)
10678 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010679 return unicode_result_unchanged(self);
10680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 error:
10682 if (srelease && sbuf)
10683 PyMem_FREE(sbuf);
10684 if (release1 && buf1)
10685 PyMem_FREE(buf1);
10686 if (release2 && buf2)
10687 PyMem_FREE(buf2);
10688 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689}
10690
10691/* --- Unicode Object Methods --------------------------------------------- */
10692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010693PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010694 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695\n\
10696Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010697characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698
10699static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010700unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010702 if (PyUnicode_READY(self) == -1)
10703 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010704 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705}
10706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010707PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010708 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709\n\
10710Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010711have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712
10713static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010714unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010716 if (PyUnicode_READY(self) == -1)
10717 return NULL;
10718 if (PyUnicode_GET_LENGTH(self) == 0)
10719 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010720 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721}
10722
Benjamin Petersond5890c82012-01-14 13:23:30 -050010723PyDoc_STRVAR(casefold__doc__,
10724 "S.casefold() -> str\n\
10725\n\
10726Return a version of S suitable for caseless comparisons.");
10727
10728static PyObject *
10729unicode_casefold(PyObject *self)
10730{
10731 if (PyUnicode_READY(self) == -1)
10732 return NULL;
10733 if (PyUnicode_IS_ASCII(self))
10734 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010735 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010736}
10737
10738
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010739/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010740
10741static int
10742convert_uc(PyObject *obj, void *addr)
10743{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010745
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010746 if (!PyUnicode_Check(obj)) {
10747 PyErr_Format(PyExc_TypeError,
10748 "The fill character must be a unicode character, "
10749 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010750 return 0;
10751 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010752 if (PyUnicode_READY(obj) < 0)
10753 return 0;
10754 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010755 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010756 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010757 return 0;
10758 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010759 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010760 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010761}
10762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010763PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010764 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010766Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010767done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768
10769static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010770unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010772 Py_ssize_t marg, left;
10773 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 Py_UCS4 fillchar = ' ';
10775
Victor Stinnere9a29352011-10-01 02:14:59 +020010776 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778
Benjamin Petersonbac79492012-01-14 13:34:47 -050010779 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780 return NULL;
10781
Victor Stinnerc4b49542011-12-11 22:44:26 +010010782 if (PyUnicode_GET_LENGTH(self) >= width)
10783 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784
Victor Stinnerc4b49542011-12-11 22:44:26 +010010785 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786 left = marg / 2 + (marg & width & 1);
10787
Victor Stinner9310abb2011-10-05 00:59:23 +020010788 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789}
10790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791/* This function assumes that str1 and str2 are readied by the caller. */
10792
Marc-André Lemburge5034372000-08-08 08:04:29 +000010793static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010794unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010795{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010796#define COMPARE(TYPE1, TYPE2) \
10797 do { \
10798 TYPE1* p1 = (TYPE1 *)data1; \
10799 TYPE2* p2 = (TYPE2 *)data2; \
10800 TYPE1* end = p1 + len; \
10801 Py_UCS4 c1, c2; \
10802 for (; p1 != end; p1++, p2++) { \
10803 c1 = *p1; \
10804 c2 = *p2; \
10805 if (c1 != c2) \
10806 return (c1 < c2) ? -1 : 1; \
10807 } \
10808 } \
10809 while (0)
10810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 int kind1, kind2;
10812 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010813 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 kind1 = PyUnicode_KIND(str1);
10816 kind2 = PyUnicode_KIND(str2);
10817 data1 = PyUnicode_DATA(str1);
10818 data2 = PyUnicode_DATA(str2);
10819 len1 = PyUnicode_GET_LENGTH(str1);
10820 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010821 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010822
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010823 switch(kind1) {
10824 case PyUnicode_1BYTE_KIND:
10825 {
10826 switch(kind2) {
10827 case PyUnicode_1BYTE_KIND:
10828 {
10829 int cmp = memcmp(data1, data2, len);
10830 /* normalize result of memcmp() into the range [-1; 1] */
10831 if (cmp < 0)
10832 return -1;
10833 if (cmp > 0)
10834 return 1;
10835 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010836 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010837 case PyUnicode_2BYTE_KIND:
10838 COMPARE(Py_UCS1, Py_UCS2);
10839 break;
10840 case PyUnicode_4BYTE_KIND:
10841 COMPARE(Py_UCS1, Py_UCS4);
10842 break;
10843 default:
10844 assert(0);
10845 }
10846 break;
10847 }
10848 case PyUnicode_2BYTE_KIND:
10849 {
10850 switch(kind2) {
10851 case PyUnicode_1BYTE_KIND:
10852 COMPARE(Py_UCS2, Py_UCS1);
10853 break;
10854 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010855 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010856 COMPARE(Py_UCS2, Py_UCS2);
10857 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010858 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010859 case PyUnicode_4BYTE_KIND:
10860 COMPARE(Py_UCS2, Py_UCS4);
10861 break;
10862 default:
10863 assert(0);
10864 }
10865 break;
10866 }
10867 case PyUnicode_4BYTE_KIND:
10868 {
10869 switch(kind2) {
10870 case PyUnicode_1BYTE_KIND:
10871 COMPARE(Py_UCS4, Py_UCS1);
10872 break;
10873 case PyUnicode_2BYTE_KIND:
10874 COMPARE(Py_UCS4, Py_UCS2);
10875 break;
10876 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010877 {
10878#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10879 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10880 /* normalize result of wmemcmp() into the range [-1; 1] */
10881 if (cmp < 0)
10882 return -1;
10883 if (cmp > 0)
10884 return 1;
10885#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010886 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010887#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010888 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010889 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010890 default:
10891 assert(0);
10892 }
10893 break;
10894 }
10895 default:
10896 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010897 }
10898
Victor Stinner770e19e2012-10-04 22:59:45 +020010899 if (len1 == len2)
10900 return 0;
10901 if (len1 < len2)
10902 return -1;
10903 else
10904 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010905
10906#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010907}
10908
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010909Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010910unicode_compare_eq(PyObject *str1, PyObject *str2)
10911{
10912 int kind;
10913 void *data1, *data2;
10914 Py_ssize_t len;
10915 int cmp;
10916
Victor Stinnere5567ad2012-10-23 02:48:49 +020010917 len = PyUnicode_GET_LENGTH(str1);
10918 if (PyUnicode_GET_LENGTH(str2) != len)
10919 return 0;
10920 kind = PyUnicode_KIND(str1);
10921 if (PyUnicode_KIND(str2) != kind)
10922 return 0;
10923 data1 = PyUnicode_DATA(str1);
10924 data2 = PyUnicode_DATA(str2);
10925
10926 cmp = memcmp(data1, data2, len * kind);
10927 return (cmp == 0);
10928}
10929
10930
Alexander Belopolsky40018472011-02-26 01:02:56 +000010931int
10932PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10935 if (PyUnicode_READY(left) == -1 ||
10936 PyUnicode_READY(right) == -1)
10937 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010938
10939 /* a string is equal to itself */
10940 if (left == right)
10941 return 0;
10942
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010943 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010945 PyErr_Format(PyExc_TypeError,
10946 "Can't compare %.100s and %.100s",
10947 left->ob_type->tp_name,
10948 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949 return -1;
10950}
10951
Martin v. Löwis5b222132007-06-10 09:51:05 +000010952int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010953_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10954{
10955 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10956 if (right_str == NULL)
10957 return -1;
10958 return PyUnicode_Compare(left, right_str);
10959}
10960
10961int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010962PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 Py_ssize_t i;
10965 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 Py_UCS4 chr;
10967
Victor Stinner910337b2011-10-03 03:20:16 +020010968 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 if (PyUnicode_READY(uni) == -1)
10970 return -1;
10971 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010972 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010973 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010974 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010975 size_t len, len2 = strlen(str);
10976 int cmp;
10977
10978 len = Py_MIN(len1, len2);
10979 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010980 if (cmp != 0) {
10981 if (cmp < 0)
10982 return -1;
10983 else
10984 return 1;
10985 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010986 if (len1 > len2)
10987 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010988 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010989 return -1; /* str is longer */
10990 return 0;
10991 }
10992 else {
10993 void *data = PyUnicode_DATA(uni);
10994 /* Compare Unicode string and source character set string */
10995 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010996 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010997 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10998 /* This check keeps Python strings that end in '\0' from comparing equal
10999 to C strings identical up to that point. */
11000 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11001 return 1; /* uni is longer */
11002 if (str[i])
11003 return -1; /* str is longer */
11004 return 0;
11005 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011006}
11007
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011008
Benjamin Peterson29060642009-01-31 22:14:21 +000011009#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000011010 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011011
Alexander Belopolsky40018472011-02-26 01:02:56 +000011012PyObject *
11013PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011014{
11015 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011016 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011017
Victor Stinnere5567ad2012-10-23 02:48:49 +020011018 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11019 Py_RETURN_NOTIMPLEMENTED;
11020
11021 if (PyUnicode_READY(left) == -1 ||
11022 PyUnicode_READY(right) == -1)
11023 return NULL;
11024
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011025 if (left == right) {
11026 switch (op) {
11027 case Py_EQ:
11028 case Py_LE:
11029 case Py_GE:
11030 /* a string is equal to itself */
11031 v = Py_True;
11032 break;
11033 case Py_NE:
11034 case Py_LT:
11035 case Py_GT:
11036 v = Py_False;
11037 break;
11038 default:
11039 PyErr_BadArgument();
11040 return NULL;
11041 }
11042 }
11043 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011044 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011045 result ^= (op == Py_NE);
11046 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011047 }
11048 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011049 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011050
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011051 /* Convert the return value to a Boolean */
11052 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011053 case Py_LE:
11054 v = TEST_COND(result <= 0);
11055 break;
11056 case Py_GE:
11057 v = TEST_COND(result >= 0);
11058 break;
11059 case Py_LT:
11060 v = TEST_COND(result == -1);
11061 break;
11062 case Py_GT:
11063 v = TEST_COND(result == 1);
11064 break;
11065 default:
11066 PyErr_BadArgument();
11067 return NULL;
11068 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011069 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020011070 Py_INCREF(v);
11071 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011072}
11073
Alexander Belopolsky40018472011-02-26 01:02:56 +000011074int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011075_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11076{
11077 return unicode_eq(aa, bb);
11078}
11079
11080int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011081PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011082{
Victor Stinner77282cb2013-04-14 19:22:47 +020011083 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 void *buf1, *buf2;
11085 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011086 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011087
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011088 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011089 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011090 "'in <string>' requires string as left operand, not %.100s",
11091 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011092 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011093 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011094 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011095 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011096 if (ensure_unicode(str) < 0)
11097 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011100 kind2 = PyUnicode_KIND(substr);
11101 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011102 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011104 len2 = PyUnicode_GET_LENGTH(substr);
11105 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011106 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011107 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011108 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011109 if (len2 == 1) {
11110 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11111 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011112 return result;
11113 }
11114 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011115 buf2 = _PyUnicode_AsKind(substr, kind1);
11116 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011117 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119
Victor Stinner77282cb2013-04-14 19:22:47 +020011120 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011121 case PyUnicode_1BYTE_KIND:
11122 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11123 break;
11124 case PyUnicode_2BYTE_KIND:
11125 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11126 break;
11127 case PyUnicode_4BYTE_KIND:
11128 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11129 break;
11130 default:
11131 result = -1;
11132 assert(0);
11133 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011134
Victor Stinner77282cb2013-04-14 19:22:47 +020011135 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 PyMem_Free(buf2);
11137
Guido van Rossum403d68b2000-03-13 15:55:09 +000011138 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011139}
11140
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141/* Concat to string or Unicode object giving a new Unicode object. */
11142
Alexander Belopolsky40018472011-02-26 01:02:56 +000011143PyObject *
11144PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011146 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011147 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011148 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011150 if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0)
11151 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152
11153 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011154 if (left == unicode_empty)
11155 return PyUnicode_FromObject(right);
11156 if (right == unicode_empty)
11157 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011159 left_len = PyUnicode_GET_LENGTH(left);
11160 right_len = PyUnicode_GET_LENGTH(right);
11161 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011162 PyErr_SetString(PyExc_OverflowError,
11163 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011164 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011165 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011166 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011167
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011168 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11169 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011170 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011173 result = PyUnicode_New(new_len, maxchar);
11174 if (result == NULL)
11175 return NULL;
11176 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11177 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11178 assert(_PyUnicode_CheckConsistency(result, 1));
11179 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180}
11181
Walter Dörwald1ab83302007-05-18 17:15:44 +000011182void
Victor Stinner23e56682011-10-03 03:54:37 +020011183PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011184{
Victor Stinner23e56682011-10-03 03:54:37 +020011185 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011186 Py_UCS4 maxchar, maxchar2;
11187 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011188
11189 if (p_left == NULL) {
11190 if (!PyErr_Occurred())
11191 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011192 return;
11193 }
Victor Stinner23e56682011-10-03 03:54:37 +020011194 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011195 if (right == NULL || left == NULL
11196 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011197 if (!PyErr_Occurred())
11198 PyErr_BadInternalCall();
11199 goto error;
11200 }
11201
Benjamin Petersonbac79492012-01-14 13:34:47 -050011202 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011203 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011204 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011205 goto error;
11206
Victor Stinner488fa492011-12-12 00:01:39 +010011207 /* Shortcuts */
11208 if (left == unicode_empty) {
11209 Py_DECREF(left);
11210 Py_INCREF(right);
11211 *p_left = right;
11212 return;
11213 }
11214 if (right == unicode_empty)
11215 return;
11216
11217 left_len = PyUnicode_GET_LENGTH(left);
11218 right_len = PyUnicode_GET_LENGTH(right);
11219 if (left_len > PY_SSIZE_T_MAX - right_len) {
11220 PyErr_SetString(PyExc_OverflowError,
11221 "strings are too large to concat");
11222 goto error;
11223 }
11224 new_len = left_len + right_len;
11225
11226 if (unicode_modifiable(left)
11227 && PyUnicode_CheckExact(right)
11228 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011229 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11230 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011231 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011232 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011233 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11234 {
11235 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011236 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011237 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011238
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011239 /* copy 'right' into the newly allocated area of 'left' */
11240 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011241 }
Victor Stinner488fa492011-12-12 00:01:39 +010011242 else {
11243 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11244 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011245 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011246
Victor Stinner488fa492011-12-12 00:01:39 +010011247 /* Concat the two Unicode strings */
11248 res = PyUnicode_New(new_len, maxchar);
11249 if (res == NULL)
11250 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011251 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11252 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011253 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011254 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011255 }
11256 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011257 return;
11258
11259error:
Victor Stinner488fa492011-12-12 00:01:39 +010011260 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011261}
11262
11263void
11264PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11265{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011266 PyUnicode_Append(pleft, right);
11267 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011268}
11269
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011270/*
11271Wraps stringlib_parse_args_finds() and additionally ensures that the
11272first argument is a unicode object.
11273*/
11274
11275Py_LOCAL_INLINE(int)
11276parse_args_finds_unicode(const char * function_name, PyObject *args,
11277 PyObject **substring,
11278 Py_ssize_t *start, Py_ssize_t *end)
11279{
11280 if(stringlib_parse_args_finds(function_name, args, substring,
11281 start, end)) {
11282 if (ensure_unicode(*substring) < 0)
11283 return 0;
11284 return 1;
11285 }
11286 return 0;
11287}
11288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011289PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011290 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011292Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011293string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011294interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295
11296static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011297unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011299 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011300 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011301 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011303 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 void *buf1, *buf2;
11305 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011307 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011308 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011310 kind1 = PyUnicode_KIND(self);
11311 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011312 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011313 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011314
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315 len1 = PyUnicode_GET_LENGTH(self);
11316 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011318 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011319 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011320
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011321 buf1 = PyUnicode_DATA(self);
11322 buf2 = PyUnicode_DATA(substring);
11323 if (kind2 != kind1) {
11324 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011325 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011326 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011327 }
11328 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 case PyUnicode_1BYTE_KIND:
11330 iresult = ucs1lib_count(
11331 ((Py_UCS1*)buf1) + start, end - start,
11332 buf2, len2, PY_SSIZE_T_MAX
11333 );
11334 break;
11335 case PyUnicode_2BYTE_KIND:
11336 iresult = ucs2lib_count(
11337 ((Py_UCS2*)buf1) + start, end - start,
11338 buf2, len2, PY_SSIZE_T_MAX
11339 );
11340 break;
11341 case PyUnicode_4BYTE_KIND:
11342 iresult = ucs4lib_count(
11343 ((Py_UCS4*)buf1) + start, end - start,
11344 buf2, len2, PY_SSIZE_T_MAX
11345 );
11346 break;
11347 default:
11348 assert(0); iresult = 0;
11349 }
11350
11351 result = PyLong_FromSsize_t(iresult);
11352
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011353 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356 return result;
11357}
11358
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011359PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011360 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011362Encode S using the codec registered for encoding. Default encoding\n\
11363is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011364handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011365a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11366'xmlcharrefreplace' as well as any other name registered with\n\
11367codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368
11369static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011370unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011372 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373 char *encoding = NULL;
11374 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011375
Benjamin Peterson308d6372009-09-18 21:42:35 +000011376 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11377 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011379 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011380}
11381
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011382PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011383 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011384\n\
11385Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011386If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387
11388static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011389unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011391 Py_ssize_t i, j, line_pos, src_len, incr;
11392 Py_UCS4 ch;
11393 PyObject *u;
11394 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011395 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011397 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011398 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399
Ezio Melotti745d54d2013-11-16 19:10:57 +020011400 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11401 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011402 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403
Antoine Pitrou22425222011-10-04 19:10:51 +020011404 if (PyUnicode_READY(self) == -1)
11405 return NULL;
11406
Thomas Wouters7e474022000-07-16 12:04:32 +000011407 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011408 src_len = PyUnicode_GET_LENGTH(self);
11409 i = j = line_pos = 0;
11410 kind = PyUnicode_KIND(self);
11411 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011412 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011413 for (; i < src_len; i++) {
11414 ch = PyUnicode_READ(kind, src_data, i);
11415 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011416 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011417 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011418 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011419 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011420 goto overflow;
11421 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011422 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011423 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011426 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011427 goto overflow;
11428 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011430 if (ch == '\n' || ch == '\r')
11431 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011433 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011434 if (!found)
11435 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011436
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011438 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 if (!u)
11440 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011441 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
Antoine Pitroue71d5742011-10-04 15:55:09 +020011443 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444
Antoine Pitroue71d5742011-10-04 15:55:09 +020011445 for (; i < src_len; i++) {
11446 ch = PyUnicode_READ(kind, src_data, i);
11447 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011449 incr = tabsize - (line_pos % tabsize);
11450 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011451 FILL(kind, dest_data, ' ', j, incr);
11452 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011454 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011456 line_pos++;
11457 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011458 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011459 if (ch == '\n' || ch == '\r')
11460 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011462 }
11463 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011464 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011465
Antoine Pitroue71d5742011-10-04 15:55:09 +020011466 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011467 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11468 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469}
11470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011471PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011472 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473\n\
11474Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011475such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476arguments start and end are interpreted as in slice notation.\n\
11477\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011478Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479
11480static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011483 /* initialize variables to prevent gcc warning */
11484 PyObject *substring = NULL;
11485 Py_ssize_t start = 0;
11486 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011487 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011489 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011492 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011495 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 if (result == -2)
11498 return NULL;
11499
Christian Heimes217cfd12007-12-02 14:31:20 +000011500 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501}
11502
11503static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011504unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011506 void *data;
11507 enum PyUnicode_Kind kind;
11508 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011509
11510 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11511 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011513 }
11514 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11515 PyErr_SetString(PyExc_IndexError, "string index out of range");
11516 return NULL;
11517 }
11518 kind = PyUnicode_KIND(self);
11519 data = PyUnicode_DATA(self);
11520 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011521 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522}
11523
Guido van Rossumc2504932007-09-18 19:42:40 +000011524/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011525 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011526static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011527unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528{
Guido van Rossumc2504932007-09-18 19:42:40 +000011529 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011530 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011531
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011532#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011533 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011534#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 if (_PyUnicode_HASH(self) != -1)
11536 return _PyUnicode_HASH(self);
11537 if (PyUnicode_READY(self) == -1)
11538 return -1;
11539 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011540 /*
11541 We make the hash of the empty string be 0, rather than using
11542 (prefix ^ suffix), since this slightly obfuscates the hash secret
11543 */
11544 if (len == 0) {
11545 _PyUnicode_HASH(self) = 0;
11546 return 0;
11547 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011548 x = _Py_HashBytes(PyUnicode_DATA(self),
11549 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011550 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011551 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552}
11553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011554PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011557Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558
11559static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011562 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011563 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011564 PyObject *substring = NULL;
11565 Py_ssize_t start = 0;
11566 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011568 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011571 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011574 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 if (result == -2)
11577 return NULL;
11578
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579 if (result < 0) {
11580 PyErr_SetString(PyExc_ValueError, "substring not found");
11581 return NULL;
11582 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011583
Christian Heimes217cfd12007-12-02 14:31:20 +000011584 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585}
11586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011587PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011590Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011591at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592
11593static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011594unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 Py_ssize_t i, length;
11597 int kind;
11598 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 int cased;
11600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 if (PyUnicode_READY(self) == -1)
11602 return NULL;
11603 length = PyUnicode_GET_LENGTH(self);
11604 kind = PyUnicode_KIND(self);
11605 data = PyUnicode_DATA(self);
11606
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 if (length == 1)
11609 return PyBool_FromLong(
11610 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011612 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011614 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011615
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 for (i = 0; i < length; i++) {
11618 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011619
Benjamin Peterson29060642009-01-31 22:14:21 +000011620 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11621 return PyBool_FromLong(0);
11622 else if (!cased && Py_UNICODE_ISLOWER(ch))
11623 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011625 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626}
11627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011628PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011631Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011632at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633
11634static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011635unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 Py_ssize_t i, length;
11638 int kind;
11639 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640 int cased;
11641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 if (PyUnicode_READY(self) == -1)
11643 return NULL;
11644 length = PyUnicode_GET_LENGTH(self);
11645 kind = PyUnicode_KIND(self);
11646 data = PyUnicode_DATA(self);
11647
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 if (length == 1)
11650 return PyBool_FromLong(
11651 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011653 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011655 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011656
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 for (i = 0; i < length; i++) {
11659 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011660
Benjamin Peterson29060642009-01-31 22:14:21 +000011661 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11662 return PyBool_FromLong(0);
11663 else if (!cased && Py_UNICODE_ISUPPER(ch))
11664 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011666 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667}
11668
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011669PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011670 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011672Return True if S is a titlecased string and there is at least one\n\
11673character in S, i.e. upper- and titlecase characters may only\n\
11674follow uncased characters and lowercase characters only cased ones.\n\
11675Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676
11677static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011678unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 Py_ssize_t i, length;
11681 int kind;
11682 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683 int cased, previous_is_cased;
11684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 if (PyUnicode_READY(self) == -1)
11686 return NULL;
11687 length = PyUnicode_GET_LENGTH(self);
11688 kind = PyUnicode_KIND(self);
11689 data = PyUnicode_DATA(self);
11690
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 if (length == 1) {
11693 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11694 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11695 (Py_UNICODE_ISUPPER(ch) != 0));
11696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011698 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011700 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011701
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702 cased = 0;
11703 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 for (i = 0; i < length; i++) {
11705 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011706
Benjamin Peterson29060642009-01-31 22:14:21 +000011707 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11708 if (previous_is_cased)
11709 return PyBool_FromLong(0);
11710 previous_is_cased = 1;
11711 cased = 1;
11712 }
11713 else if (Py_UNICODE_ISLOWER(ch)) {
11714 if (!previous_is_cased)
11715 return PyBool_FromLong(0);
11716 previous_is_cased = 1;
11717 cased = 1;
11718 }
11719 else
11720 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011722 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723}
11724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011725PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011728Return True if all characters in S are whitespace\n\
11729and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730
11731static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011732unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 Py_ssize_t i, length;
11735 int kind;
11736 void *data;
11737
11738 if (PyUnicode_READY(self) == -1)
11739 return NULL;
11740 length = PyUnicode_GET_LENGTH(self);
11741 kind = PyUnicode_KIND(self);
11742 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 if (length == 1)
11746 return PyBool_FromLong(
11747 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011749 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011751 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 for (i = 0; i < length; i++) {
11754 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011755 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011758 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759}
11760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011761PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011762 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011763\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011764Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011765and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011766
11767static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011768unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 Py_ssize_t i, length;
11771 int kind;
11772 void *data;
11773
11774 if (PyUnicode_READY(self) == -1)
11775 return NULL;
11776 length = PyUnicode_GET_LENGTH(self);
11777 kind = PyUnicode_KIND(self);
11778 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011779
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011780 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (length == 1)
11782 return PyBool_FromLong(
11783 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011784
11785 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 for (i = 0; i < length; i++) {
11790 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011792 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011793 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011794}
11795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011796PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011797 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011798\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011799Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011800and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011801
11802static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011803unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 int kind;
11806 void *data;
11807 Py_ssize_t len, i;
11808
11809 if (PyUnicode_READY(self) == -1)
11810 return NULL;
11811
11812 kind = PyUnicode_KIND(self);
11813 data = PyUnicode_DATA(self);
11814 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011815
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011816 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 if (len == 1) {
11818 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11819 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11820 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011821
11822 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 for (i = 0; i < len; i++) {
11827 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011828 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011830 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011831 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011832}
11833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011834PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011835 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011837Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011838False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839
11840static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011841unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 Py_ssize_t i, length;
11844 int kind;
11845 void *data;
11846
11847 if (PyUnicode_READY(self) == -1)
11848 return NULL;
11849 length = PyUnicode_GET_LENGTH(self);
11850 kind = PyUnicode_KIND(self);
11851 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 if (length == 1)
11855 return PyBool_FromLong(
11856 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011858 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011860 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 for (i = 0; i < length; i++) {
11863 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011866 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867}
11868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011869PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011872Return True if all characters in S are digits\n\
11873and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874
11875static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011876unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 Py_ssize_t i, length;
11879 int kind;
11880 void *data;
11881
11882 if (PyUnicode_READY(self) == -1)
11883 return NULL;
11884 length = PyUnicode_GET_LENGTH(self);
11885 kind = PyUnicode_KIND(self);
11886 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 if (length == 1) {
11890 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11891 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011894 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011896 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 for (i = 0; i < length; i++) {
11899 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011902 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903}
11904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011905PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011908Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011909False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910
11911static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011912unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 Py_ssize_t i, length;
11915 int kind;
11916 void *data;
11917
11918 if (PyUnicode_READY(self) == -1)
11919 return NULL;
11920 length = PyUnicode_GET_LENGTH(self);
11921 kind = PyUnicode_KIND(self);
11922 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 if (length == 1)
11926 return PyBool_FromLong(
11927 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011929 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 for (i = 0; i < length; i++) {
11934 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011935 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011937 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938}
11939
Martin v. Löwis47383402007-08-15 07:32:56 +000011940int
11941PyUnicode_IsIdentifier(PyObject *self)
11942{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 int kind;
11944 void *data;
11945 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011946 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 if (PyUnicode_READY(self) == -1) {
11949 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 }
11952
11953 /* Special case for empty strings */
11954 if (PyUnicode_GET_LENGTH(self) == 0)
11955 return 0;
11956 kind = PyUnicode_KIND(self);
11957 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011958
11959 /* PEP 3131 says that the first character must be in
11960 XID_Start and subsequent characters in XID_Continue,
11961 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011962 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011963 letters, digits, underscore). However, given the current
11964 definition of XID_Start and XID_Continue, it is sufficient
11965 to check just for these, except that _ must be allowed
11966 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011968 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011969 return 0;
11970
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011971 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011973 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011974 return 1;
11975}
11976
11977PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011978 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011979\n\
11980Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011981to the language definition.\n\
11982\n\
11983Use keyword.iskeyword() to test for reserved identifiers\n\
11984such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011985
11986static PyObject*
11987unicode_isidentifier(PyObject *self)
11988{
11989 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11990}
11991
Georg Brandl559e5d72008-06-11 18:37:52 +000011992PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011993 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011994\n\
11995Return True if all characters in S are considered\n\
11996printable in repr() or S is empty, False otherwise.");
11997
11998static PyObject*
11999unicode_isprintable(PyObject *self)
12000{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 Py_ssize_t i, length;
12002 int kind;
12003 void *data;
12004
12005 if (PyUnicode_READY(self) == -1)
12006 return NULL;
12007 length = PyUnicode_GET_LENGTH(self);
12008 kind = PyUnicode_KIND(self);
12009 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012010
12011 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 if (length == 1)
12013 return PyBool_FromLong(
12014 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 for (i = 0; i < length; i++) {
12017 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012018 Py_RETURN_FALSE;
12019 }
12020 }
12021 Py_RETURN_TRUE;
12022}
12023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012024PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000012025 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026\n\
12027Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000012028iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029
12030static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012031unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012033 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034}
12035
Martin v. Löwis18e16552006-02-15 17:27:45 +000012036static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012037unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 if (PyUnicode_READY(self) == -1)
12040 return -1;
12041 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042}
12043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012044PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012045 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012047Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012048done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049
12050static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012051unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012053 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 Py_UCS4 fillchar = ' ';
12055
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012056 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057 return NULL;
12058
Benjamin Petersonbac79492012-01-14 13:34:47 -050012059 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012060 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061
Victor Stinnerc4b49542011-12-11 22:44:26 +010012062 if (PyUnicode_GET_LENGTH(self) >= width)
12063 return unicode_result_unchanged(self);
12064
12065 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066}
12067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012068PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012071Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072
12073static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012074unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012076 if (PyUnicode_READY(self) == -1)
12077 return NULL;
12078 if (PyUnicode_IS_ASCII(self))
12079 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012080 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081}
12082
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above */
static const char * const stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
12091
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012092/* externally visible for str.strip(unicode) */
12093PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012094_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012095{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 void *data;
12097 int kind;
12098 Py_ssize_t i, j, len;
12099 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012100 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12103 return NULL;
12104
12105 kind = PyUnicode_KIND(self);
12106 data = PyUnicode_DATA(self);
12107 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012108 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12110 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012111 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012112
Benjamin Peterson14339b62009-01-31 16:36:08 +000012113 i = 0;
12114 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012115 while (i < len) {
12116 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12117 if (!BLOOM(sepmask, ch))
12118 break;
12119 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12120 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012121 i++;
12122 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012123 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012124
Benjamin Peterson14339b62009-01-31 16:36:08 +000012125 j = len;
12126 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012127 j--;
12128 while (j >= i) {
12129 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12130 if (!BLOOM(sepmask, ch))
12131 break;
12132 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12133 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012135 }
12136
Benjamin Peterson29060642009-01-31 22:14:21 +000012137 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012138 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012139
Victor Stinner7931d9a2011-11-04 00:22:48 +010012140 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141}
12142
12143PyObject*
12144PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12145{
12146 unsigned char *data;
12147 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012148 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149
Victor Stinnerde636f32011-10-01 03:55:54 +020012150 if (PyUnicode_READY(self) == -1)
12151 return NULL;
12152
Victor Stinner684d5fd2012-05-03 02:32:34 +020012153 length = PyUnicode_GET_LENGTH(self);
12154 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012155
Victor Stinner684d5fd2012-05-03 02:32:34 +020012156 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012157 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158
Victor Stinnerde636f32011-10-01 03:55:54 +020012159 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012160 PyErr_SetString(PyExc_IndexError, "string index out of range");
12161 return NULL;
12162 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012163 if (start >= length || end < start)
12164 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012165
Victor Stinner684d5fd2012-05-03 02:32:34 +020012166 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012167 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012168 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012169 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012170 }
12171 else {
12172 kind = PyUnicode_KIND(self);
12173 data = PyUnicode_1BYTE_DATA(self);
12174 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012175 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012176 length);
12177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179
12180static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012181do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 Py_ssize_t len, i, j;
12184
12185 if (PyUnicode_READY(self) == -1)
12186 return NULL;
12187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012189
Victor Stinnercc7af722013-04-09 22:39:24 +020012190 if (PyUnicode_IS_ASCII(self)) {
12191 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12192
12193 i = 0;
12194 if (striptype != RIGHTSTRIP) {
12195 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012196 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012197 if (!_Py_ascii_whitespace[ch])
12198 break;
12199 i++;
12200 }
12201 }
12202
12203 j = len;
12204 if (striptype != LEFTSTRIP) {
12205 j--;
12206 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012207 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012208 if (!_Py_ascii_whitespace[ch])
12209 break;
12210 j--;
12211 }
12212 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012213 }
12214 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012215 else {
12216 int kind = PyUnicode_KIND(self);
12217 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012218
Victor Stinnercc7af722013-04-09 22:39:24 +020012219 i = 0;
12220 if (striptype != RIGHTSTRIP) {
12221 while (i < len) {
12222 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12223 if (!Py_UNICODE_ISSPACE(ch))
12224 break;
12225 i++;
12226 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012227 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012228
12229 j = len;
12230 if (striptype != LEFTSTRIP) {
12231 j--;
12232 while (j >= i) {
12233 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12234 if (!Py_UNICODE_ISSPACE(ch))
12235 break;
12236 j--;
12237 }
12238 j++;
12239 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012240 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012241
Victor Stinner7931d9a2011-11-04 00:22:48 +010012242 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243}
12244
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012245
12246static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012247do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012248{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012249 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012250
Serhiy Storchakac6792272013-10-19 21:03:34 +030012251 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012252 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012253
Benjamin Peterson14339b62009-01-31 16:36:08 +000012254 if (sep != NULL && sep != Py_None) {
12255 if (PyUnicode_Check(sep))
12256 return _PyUnicode_XStrip(self, striptype, sep);
12257 else {
12258 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012259 "%s arg must be None or str",
12260 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012261 return NULL;
12262 }
12263 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012264
Benjamin Peterson14339b62009-01-31 16:36:08 +000012265 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012266}
12267
12268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012269PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012270 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012271\n\
12272Return a copy of the string S with leading and trailing\n\
12273whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012274If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012275
12276static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012277unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012278{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012279 if (PyTuple_GET_SIZE(args) == 0)
12280 return do_strip(self, BOTHSTRIP); /* Common case */
12281 else
12282 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012283}
12284
12285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012286PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012287 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012288\n\
12289Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012290If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012291
12292static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012293unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012294{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012295 if (PyTuple_GET_SIZE(args) == 0)
12296 return do_strip(self, LEFTSTRIP); /* Common case */
12297 else
12298 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012299}
12300
12301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012302PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012304\n\
12305Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012306If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307
12308static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012309unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012310{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012311 if (PyTuple_GET_SIZE(args) == 0)
12312 return do_strip(self, RIGHTSTRIP); /* Common case */
12313 else
12314 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012315}
12316
12317
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012319unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012321 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323
Serhiy Storchaka05997252013-01-26 12:14:02 +020012324 if (len < 1)
12325 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012326
Victor Stinnerc4b49542011-12-11 22:44:26 +010012327 /* no repeat, return original string */
12328 if (len == 1)
12329 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012330
Benjamin Petersonbac79492012-01-14 13:34:47 -050012331 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 return NULL;
12333
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012334 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012335 PyErr_SetString(PyExc_OverflowError,
12336 "repeated string is too long");
12337 return NULL;
12338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012340
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012341 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342 if (!u)
12343 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012344 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 if (PyUnicode_GET_LENGTH(str) == 1) {
12347 const int kind = PyUnicode_KIND(str);
12348 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012349 if (kind == PyUnicode_1BYTE_KIND) {
12350 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012351 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012352 }
12353 else if (kind == PyUnicode_2BYTE_KIND) {
12354 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012355 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012356 ucs2[n] = fill_char;
12357 } else {
12358 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12359 assert(kind == PyUnicode_4BYTE_KIND);
12360 for (n = 0; n < len; ++n)
12361 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 }
12364 else {
12365 /* number of characters copied this far */
12366 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012367 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 char *to = (char *) PyUnicode_DATA(u);
12369 Py_MEMCPY(to, PyUnicode_DATA(str),
12370 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012371 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 n = (done <= nchars-done) ? done : nchars-done;
12373 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012374 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376 }
12377
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012378 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012379 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380}
12381
Alexander Belopolsky40018472011-02-26 01:02:56 +000012382PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012383PyUnicode_Replace(PyObject *str,
12384 PyObject *substr,
12385 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012386 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012388 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12389 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012391 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392}
12393
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012394PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012395 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396\n\
12397Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012398old replaced by new. If the optional argument count is\n\
12399given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400
12401static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012403{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 PyObject *str1;
12405 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012406 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012407
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012408 if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012409 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012410 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012412 return replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413}
12414
Alexander Belopolsky40018472011-02-26 01:02:56 +000012415static PyObject *
12416unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012418 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012419 Py_ssize_t isize;
12420 Py_ssize_t osize, squote, dquote, i, o;
12421 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012422 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012426 return NULL;
12427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 isize = PyUnicode_GET_LENGTH(unicode);
12429 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 /* Compute length of output, quote characters, and
12432 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012433 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434 max = 127;
12435 squote = dquote = 0;
12436 ikind = PyUnicode_KIND(unicode);
12437 for (i = 0; i < isize; i++) {
12438 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012439 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012441 case '\'': squote++; break;
12442 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012444 incr = 2;
12445 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 default:
12447 /* Fast-path ASCII */
12448 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012449 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012451 ;
12452 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012455 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012457 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012459 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012460 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012461 if (osize > PY_SSIZE_T_MAX - incr) {
12462 PyErr_SetString(PyExc_OverflowError,
12463 "string is too long to generate repr");
12464 return NULL;
12465 }
12466 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 }
12468
12469 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012470 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012472 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 if (dquote)
12474 /* Both squote and dquote present. Use squote,
12475 and escape them */
12476 osize += squote;
12477 else
12478 quote = '"';
12479 }
Victor Stinner55c08782013-04-14 18:45:39 +020012480 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012481
12482 repr = PyUnicode_New(osize, max);
12483 if (repr == NULL)
12484 return NULL;
12485 okind = PyUnicode_KIND(repr);
12486 odata = PyUnicode_DATA(repr);
12487
12488 PyUnicode_WRITE(okind, odata, 0, quote);
12489 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012490 if (unchanged) {
12491 _PyUnicode_FastCopyCharacters(repr, 1,
12492 unicode, 0,
12493 isize);
12494 }
12495 else {
12496 for (i = 0, o = 1; i < isize; i++) {
12497 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498
Victor Stinner55c08782013-04-14 18:45:39 +020012499 /* Escape quotes and backslashes */
12500 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012501 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012503 continue;
12504 }
12505
12506 /* Map special whitespace to '\t', \n', '\r' */
12507 if (ch == '\t') {
12508 PyUnicode_WRITE(okind, odata, o++, '\\');
12509 PyUnicode_WRITE(okind, odata, o++, 't');
12510 }
12511 else if (ch == '\n') {
12512 PyUnicode_WRITE(okind, odata, o++, '\\');
12513 PyUnicode_WRITE(okind, odata, o++, 'n');
12514 }
12515 else if (ch == '\r') {
12516 PyUnicode_WRITE(okind, odata, o++, '\\');
12517 PyUnicode_WRITE(okind, odata, o++, 'r');
12518 }
12519
12520 /* Map non-printable US ASCII to '\xhh' */
12521 else if (ch < ' ' || ch == 0x7F) {
12522 PyUnicode_WRITE(okind, odata, o++, '\\');
12523 PyUnicode_WRITE(okind, odata, o++, 'x');
12524 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12525 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12526 }
12527
12528 /* Copy ASCII characters as-is */
12529 else if (ch < 0x7F) {
12530 PyUnicode_WRITE(okind, odata, o++, ch);
12531 }
12532
12533 /* Non-ASCII characters */
12534 else {
12535 /* Map Unicode whitespace and control characters
12536 (categories Z* and C* except ASCII space)
12537 */
12538 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12539 PyUnicode_WRITE(okind, odata, o++, '\\');
12540 /* Map 8-bit characters to '\xhh' */
12541 if (ch <= 0xff) {
12542 PyUnicode_WRITE(okind, odata, o++, 'x');
12543 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12544 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12545 }
12546 /* Map 16-bit characters to '\uxxxx' */
12547 else if (ch <= 0xffff) {
12548 PyUnicode_WRITE(okind, odata, o++, 'u');
12549 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12550 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12551 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12552 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12553 }
12554 /* Map 21-bit characters to '\U00xxxxxx' */
12555 else {
12556 PyUnicode_WRITE(okind, odata, o++, 'U');
12557 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12558 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12559 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12560 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12561 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12562 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12563 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12564 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12565 }
12566 }
12567 /* Copy characters as-is */
12568 else {
12569 PyUnicode_WRITE(okind, odata, o++, ch);
12570 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012571 }
12572 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012575 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012576 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577}
12578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012579PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012580 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581\n\
12582Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012583such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584arguments start and end are interpreted as in slice notation.\n\
12585\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012586Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587
12588static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012589unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012591 /* initialize variables to prevent gcc warning */
12592 PyObject *substring = NULL;
12593 Py_ssize_t start = 0;
12594 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012595 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012597 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012600 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012603 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 if (result == -2)
12606 return NULL;
12607
Christian Heimes217cfd12007-12-02 14:31:20 +000012608 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609}
12610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012611PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012614Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615
12616static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012619 /* initialize variables to prevent gcc warning */
12620 PyObject *substring = NULL;
12621 Py_ssize_t start = 0;
12622 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012623 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012625 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012628 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012631 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 if (result == -2)
12634 return NULL;
12635
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636 if (result < 0) {
12637 PyErr_SetString(PyExc_ValueError, "substring not found");
12638 return NULL;
12639 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640
Christian Heimes217cfd12007-12-02 14:31:20 +000012641 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642}
12643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012644PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012645 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012647Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012648done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649
12650static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012651unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012653 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 Py_UCS4 fillchar = ' ';
12655
Victor Stinnere9a29352011-10-01 02:14:59 +020012656 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012658
Benjamin Petersonbac79492012-01-14 13:34:47 -050012659 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660 return NULL;
12661
Victor Stinnerc4b49542011-12-11 22:44:26 +010012662 if (PyUnicode_GET_LENGTH(self) >= width)
12663 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664
Victor Stinnerc4b49542011-12-11 22:44:26 +010012665 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666}
12667
Alexander Belopolsky40018472011-02-26 01:02:56 +000012668PyObject *
12669PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012671 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012674 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675}
12676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012677PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012678 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679\n\
12680Return a list of the words in S, using sep as the\n\
12681delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012682splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012683whitespace string is a separator and empty strings are\n\
12684removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685
12686static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012687unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012689 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012691 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012693 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12694 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695 return NULL;
12696
12697 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012698 return split(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012699
12700 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012701 return split(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012702
12703 PyErr_Format(PyExc_TypeError,
12704 "must be str or None, not %.100s",
12705 Py_TYPE(substring)->tp_name);
12706 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012707}
12708
Thomas Wouters477c8d52006-05-27 19:21:47 +000012709PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012710PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012711{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012712 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012713 int kind1, kind2;
12714 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012716
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012717 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012718 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012719
Victor Stinner14f8f022011-10-05 20:58:25 +020012720 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 len1 = PyUnicode_GET_LENGTH(str_obj);
12723 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012724 if (kind1 < kind2 || len1 < len2) {
12725 _Py_INCREF_UNICODE_EMPTY();
12726 if (!unicode_empty)
12727 out = NULL;
12728 else {
12729 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12730 Py_DECREF(unicode_empty);
12731 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012732 return out;
12733 }
12734 buf1 = PyUnicode_DATA(str_obj);
12735 buf2 = PyUnicode_DATA(sep_obj);
12736 if (kind2 != kind1) {
12737 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12738 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012739 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012742 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012744 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12745 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12746 else
12747 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012748 break;
12749 case PyUnicode_2BYTE_KIND:
12750 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12751 break;
12752 case PyUnicode_4BYTE_KIND:
12753 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12754 break;
12755 default:
12756 assert(0);
12757 out = 0;
12758 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012759
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012760 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012761 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012762
12763 return out;
12764}
12765
12766
12767PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012768PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012769{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012770 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012771 int kind1, kind2;
12772 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012774
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012775 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012776 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012777
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012778 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012779 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 len1 = PyUnicode_GET_LENGTH(str_obj);
12781 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012782 if (kind1 < kind2 || len1 < len2) {
12783 _Py_INCREF_UNICODE_EMPTY();
12784 if (!unicode_empty)
12785 out = NULL;
12786 else {
12787 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12788 Py_DECREF(unicode_empty);
12789 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012790 return out;
12791 }
12792 buf1 = PyUnicode_DATA(str_obj);
12793 buf2 = PyUnicode_DATA(sep_obj);
12794 if (kind2 != kind1) {
12795 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12796 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012797 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012798 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012799
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012800 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012801 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012802 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12803 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12804 else
12805 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012806 break;
12807 case PyUnicode_2BYTE_KIND:
12808 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12809 break;
12810 case PyUnicode_4BYTE_KIND:
12811 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12812 break;
12813 default:
12814 assert(0);
12815 out = 0;
12816 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012817
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012818 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012820
12821 return out;
12822}
12823
12824PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012826\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012827Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012828the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012829found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012830
12831static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012832unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012833{
Victor Stinner9310abb2011-10-05 00:59:23 +020012834 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012835}
12836
12837PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012838 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012839\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012840Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012841the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012842separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012843
12844static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012845unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012846{
Victor Stinner9310abb2011-10-05 00:59:23 +020012847 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012848}
12849
Alexander Belopolsky40018472011-02-26 01:02:56 +000012850PyObject *
12851PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012852{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012853 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012854 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012855
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012856 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012857}
12858
12859PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012860 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012861\n\
12862Return a list of the words in S, using sep as the\n\
12863delimiter string, starting at the end of the string and\n\
12864working to the front. If maxsplit is given, at most maxsplit\n\
12865splits are done. If sep is not specified, any whitespace string\n\
12866is a separator.");
12867
12868static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012869unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012870{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012871 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012872 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012873 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012874
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012875 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12876 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012877 return NULL;
12878
12879 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012880 return rsplit(self, NULL, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012881
12882 if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012883 return rsplit(self, substring, maxcount);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012884
12885 PyErr_Format(PyExc_TypeError,
12886 "must be str or None, not %.100s",
12887 Py_TYPE(substring)->tp_name);
12888 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012889}
12890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012891PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012892 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893\n\
12894Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012895Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012896is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897
12898static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012899unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012901 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012902 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012904 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12905 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906 return NULL;
12907
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012908 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012909}
12910
12911static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012912PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012913{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012914 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915}
12916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012917PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012918 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012919\n\
12920Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012921and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012922
12923static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012924unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012925{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012926 if (PyUnicode_READY(self) == -1)
12927 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012928 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012929}
12930
Larry Hastings61272b72014-01-07 12:41:53 -080012931/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012932
Larry Hastings31826802013-10-19 00:09:25 -070012933@staticmethod
12934str.maketrans as unicode_maketrans
12935
12936 x: object
12937
12938 y: unicode=NULL
12939
12940 z: unicode=NULL
12941
12942 /
12943
12944Return a translation table usable for str.translate().
12945
12946If there is only one argument, it must be a dictionary mapping Unicode
12947ordinals (integers) or characters to Unicode ordinals, strings or None.
12948Character keys will be then converted to ordinals.
12949If there are two arguments, they must be strings of equal length, and
12950in the resulting dictionary, each character in x will be mapped to the
12951character at the same position in y. If there is a third argument, it
12952must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012953[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012954
Larry Hastings31826802013-10-19 00:09:25 -070012955static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012956unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012957/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012958{
Georg Brandlceee0772007-11-27 23:48:05 +000012959 PyObject *new = NULL, *key, *value;
12960 Py_ssize_t i = 0;
12961 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012962
Georg Brandlceee0772007-11-27 23:48:05 +000012963 new = PyDict_New();
12964 if (!new)
12965 return NULL;
12966 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 int x_kind, y_kind, z_kind;
12968 void *x_data, *y_data, *z_data;
12969
Georg Brandlceee0772007-11-27 23:48:05 +000012970 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012971 if (!PyUnicode_Check(x)) {
12972 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12973 "be a string if there is a second argument");
12974 goto err;
12975 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012977 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12978 "arguments must have equal length");
12979 goto err;
12980 }
12981 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 x_kind = PyUnicode_KIND(x);
12983 y_kind = PyUnicode_KIND(y);
12984 x_data = PyUnicode_DATA(x);
12985 y_data = PyUnicode_DATA(y);
12986 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12987 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012988 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012989 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012990 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012991 if (!value) {
12992 Py_DECREF(key);
12993 goto err;
12994 }
Georg Brandlceee0772007-11-27 23:48:05 +000012995 res = PyDict_SetItem(new, key, value);
12996 Py_DECREF(key);
12997 Py_DECREF(value);
12998 if (res < 0)
12999 goto err;
13000 }
13001 /* create entries for deleting chars in z */
13002 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003 z_kind = PyUnicode_KIND(z);
13004 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013005 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013006 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013007 if (!key)
13008 goto err;
13009 res = PyDict_SetItem(new, key, Py_None);
13010 Py_DECREF(key);
13011 if (res < 0)
13012 goto err;
13013 }
13014 }
13015 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013016 int kind;
13017 void *data;
13018
Georg Brandlceee0772007-11-27 23:48:05 +000013019 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013020 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013021 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13022 "to maketrans it must be a dict");
13023 goto err;
13024 }
13025 /* copy entries into the new dict, converting string keys to int keys */
13026 while (PyDict_Next(x, &i, &key, &value)) {
13027 if (PyUnicode_Check(key)) {
13028 /* convert string keys to integer keys */
13029 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013030 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013031 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13032 "table must be of length 1");
13033 goto err;
13034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035 kind = PyUnicode_KIND(key);
13036 data = PyUnicode_DATA(key);
13037 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013038 if (!newkey)
13039 goto err;
13040 res = PyDict_SetItem(new, newkey, value);
13041 Py_DECREF(newkey);
13042 if (res < 0)
13043 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013044 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013045 /* just keep integer keys */
13046 if (PyDict_SetItem(new, key, value) < 0)
13047 goto err;
13048 } else {
13049 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13050 "be strings or integers");
13051 goto err;
13052 }
13053 }
13054 }
13055 return new;
13056 err:
13057 Py_DECREF(new);
13058 return NULL;
13059}
13060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013061PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013062 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013064Return a copy of the string S in which each character has been mapped\n\
13065through the given translation table. The table must implement\n\
13066lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13067mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13068this operation raises LookupError, the character is left untouched.\n\
13069Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013070
13071static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013072unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075}
13076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013077PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013078 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013080Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081
13082static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013083unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013085 if (PyUnicode_READY(self) == -1)
13086 return NULL;
13087 if (PyUnicode_IS_ASCII(self))
13088 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013089 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090}
13091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013092PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013093 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013095Pad a numeric string S with zeros on the left, to fill a field\n\
13096of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097
13098static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013099unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013101 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013102 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013103 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013104 int kind;
13105 void *data;
13106 Py_UCS4 chr;
13107
Martin v. Löwis18e16552006-02-15 17:27:45 +000013108 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109 return NULL;
13110
Benjamin Petersonbac79492012-01-14 13:34:47 -050013111 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113
Victor Stinnerc4b49542011-12-11 22:44:26 +010013114 if (PyUnicode_GET_LENGTH(self) >= width)
13115 return unicode_result_unchanged(self);
13116
13117 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118
13119 u = pad(self, fill, 0, '0');
13120
Walter Dörwald068325e2002-04-15 13:36:47 +000013121 if (u == NULL)
13122 return NULL;
13123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013124 kind = PyUnicode_KIND(u);
13125 data = PyUnicode_DATA(u);
13126 chr = PyUnicode_READ(kind, data, fill);
13127
13128 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 PyUnicode_WRITE(kind, data, 0, chr);
13131 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132 }
13133
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013134 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013135 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137
#if 0
/* Compiled out: thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13145
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013146PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013148\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013149Return True if S starts with the specified prefix, False otherwise.\n\
13150With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013151With optional end, stop comparing S at that position.\n\
13152prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153
13154static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013155unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013156 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013158 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013159 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013160 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013161 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013162 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013163
Jesus Ceaac451502011-04-20 17:09:23 +020013164 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013165 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013166 if (PyTuple_Check(subobj)) {
13167 Py_ssize_t i;
13168 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013169 substring = PyTuple_GET_ITEM(subobj, i);
13170 if (!PyUnicode_Check(substring)) {
13171 PyErr_Format(PyExc_TypeError,
13172 "tuple for startswith must only contain str, "
13173 "not %.100s",
13174 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013175 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013176 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013177 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013178 if (result == -1)
13179 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013180 if (result) {
13181 Py_RETURN_TRUE;
13182 }
13183 }
13184 /* nothing matched */
13185 Py_RETURN_FALSE;
13186 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013187 if (!PyUnicode_Check(subobj)) {
13188 PyErr_Format(PyExc_TypeError,
13189 "startswith first arg must be str or "
13190 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013191 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013192 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013193 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013194 if (result == -1)
13195 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013196 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197}
13198
13199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013200PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013201 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013203Return True if S ends with the specified suffix, False otherwise.\n\
13204With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013205With optional end, stop comparing S at that position.\n\
13206suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207
13208static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013209unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013212 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013213 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013214 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013215 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013216 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217
Jesus Ceaac451502011-04-20 17:09:23 +020013218 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013219 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013220 if (PyTuple_Check(subobj)) {
13221 Py_ssize_t i;
13222 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013223 substring = PyTuple_GET_ITEM(subobj, i);
13224 if (!PyUnicode_Check(substring)) {
13225 PyErr_Format(PyExc_TypeError,
13226 "tuple for endswith must only contain str, "
13227 "not %.100s",
13228 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013229 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013230 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013231 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013232 if (result == -1)
13233 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013234 if (result) {
13235 Py_RETURN_TRUE;
13236 }
13237 }
13238 Py_RETURN_FALSE;
13239 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013240 if (!PyUnicode_Check(subobj)) {
13241 PyErr_Format(PyExc_TypeError,
13242 "endswith first arg must be str or "
13243 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013244 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013245 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013246 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013247 if (result == -1)
13248 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013249 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250}
13251
Victor Stinner202fdca2012-05-07 12:47:02 +020013252Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013253_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013254{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013255 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13256 writer->data = PyUnicode_DATA(writer->buffer);
13257
13258 if (!writer->readonly) {
13259 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013260 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013261 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013262 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013263 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13264 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13265 writer->kind = PyUnicode_WCHAR_KIND;
13266 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13267
Victor Stinner8f674cc2013-04-17 23:02:17 +020013268 /* Copy-on-write mode: set buffer size to 0 so
13269 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13270 * next write. */
13271 writer->size = 0;
13272 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013273}
13274
Victor Stinnerd3f08822012-05-29 12:57:52 +020013275void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013276_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013277{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013278 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013279
13280 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013281 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013282
13283 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13284 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13285 writer->kind = PyUnicode_WCHAR_KIND;
13286 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013287}
13288
Victor Stinnerd3f08822012-05-29 12:57:52 +020013289int
13290_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13291 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013292{
13293 Py_ssize_t newlen;
13294 PyObject *newbuffer;
13295
Victor Stinnerca9381e2015-09-22 00:58:32 +020013296 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013297 assert((maxchar > writer->maxchar && length >= 0)
13298 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013299
Victor Stinner202fdca2012-05-07 12:47:02 +020013300 if (length > PY_SSIZE_T_MAX - writer->pos) {
13301 PyErr_NoMemory();
13302 return -1;
13303 }
13304 newlen = writer->pos + length;
13305
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013306 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013307
Victor Stinnerd3f08822012-05-29 12:57:52 +020013308 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013309 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013310 if (writer->overallocate
13311 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13312 /* overallocate to limit the number of realloc() */
13313 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013314 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013315 if (newlen < writer->min_length)
13316 newlen = writer->min_length;
13317
Victor Stinnerd3f08822012-05-29 12:57:52 +020013318 writer->buffer = PyUnicode_New(newlen, maxchar);
13319 if (writer->buffer == NULL)
13320 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013321 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013322 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013323 if (writer->overallocate
13324 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13325 /* overallocate to limit the number of realloc() */
13326 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013327 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013328 if (newlen < writer->min_length)
13329 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013330
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013331 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013332 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013333 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013334 newbuffer = PyUnicode_New(newlen, maxchar);
13335 if (newbuffer == NULL)
13336 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013337 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13338 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013339 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013340 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013341 }
13342 else {
13343 newbuffer = resize_compact(writer->buffer, newlen);
13344 if (newbuffer == NULL)
13345 return -1;
13346 }
13347 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013348 }
13349 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013350 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013351 newbuffer = PyUnicode_New(writer->size, maxchar);
13352 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013353 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013354 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13355 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013356 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013357 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013358 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013359 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013360
13361#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013362}
13363
Victor Stinnerca9381e2015-09-22 00:58:32 +020013364int
13365_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13366 enum PyUnicode_Kind kind)
13367{
13368 Py_UCS4 maxchar;
13369
13370 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13371 assert(writer->kind < kind);
13372
13373 switch (kind)
13374 {
13375 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13376 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13377 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13378 default:
13379 assert(0 && "invalid kind");
13380 return -1;
13381 }
13382
13383 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13384}
13385
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013386Py_LOCAL_INLINE(int)
13387_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013388{
13389 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13390 return -1;
13391 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13392 writer->pos++;
13393 return 0;
13394}
13395
13396int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013397_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13398{
13399 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13400}
13401
13402int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013403_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13404{
13405 Py_UCS4 maxchar;
13406 Py_ssize_t len;
13407
13408 if (PyUnicode_READY(str) == -1)
13409 return -1;
13410 len = PyUnicode_GET_LENGTH(str);
13411 if (len == 0)
13412 return 0;
13413 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13414 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013415 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013416 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013417 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013418 Py_INCREF(str);
13419 writer->buffer = str;
13420 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013421 writer->pos += len;
13422 return 0;
13423 }
13424 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13425 return -1;
13426 }
13427 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13428 str, 0, len);
13429 writer->pos += len;
13430 return 0;
13431}
13432
Victor Stinnere215d962012-10-06 23:03:36 +020013433int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013434_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13435 Py_ssize_t start, Py_ssize_t end)
13436{
13437 Py_UCS4 maxchar;
13438 Py_ssize_t len;
13439
13440 if (PyUnicode_READY(str) == -1)
13441 return -1;
13442
13443 assert(0 <= start);
13444 assert(end <= PyUnicode_GET_LENGTH(str));
13445 assert(start <= end);
13446
13447 if (end == 0)
13448 return 0;
13449
13450 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13451 return _PyUnicodeWriter_WriteStr(writer, str);
13452
13453 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13454 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13455 else
13456 maxchar = writer->maxchar;
13457 len = end - start;
13458
13459 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13460 return -1;
13461
13462 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13463 str, start, len);
13464 writer->pos += len;
13465 return 0;
13466}
13467
13468int
Victor Stinner4a587072013-11-19 12:54:53 +010013469_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13470 const char *ascii, Py_ssize_t len)
13471{
13472 if (len == -1)
13473 len = strlen(ascii);
13474
13475 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13476
13477 if (writer->buffer == NULL && !writer->overallocate) {
13478 PyObject *str;
13479
13480 str = _PyUnicode_FromASCII(ascii, len);
13481 if (str == NULL)
13482 return -1;
13483
13484 writer->readonly = 1;
13485 writer->buffer = str;
13486 _PyUnicodeWriter_Update(writer);
13487 writer->pos += len;
13488 return 0;
13489 }
13490
13491 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13492 return -1;
13493
13494 switch (writer->kind)
13495 {
13496 case PyUnicode_1BYTE_KIND:
13497 {
13498 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13499 Py_UCS1 *data = writer->data;
13500
13501 Py_MEMCPY(data + writer->pos, str, len);
13502 break;
13503 }
13504 case PyUnicode_2BYTE_KIND:
13505 {
13506 _PyUnicode_CONVERT_BYTES(
13507 Py_UCS1, Py_UCS2,
13508 ascii, ascii + len,
13509 (Py_UCS2 *)writer->data + writer->pos);
13510 break;
13511 }
13512 case PyUnicode_4BYTE_KIND:
13513 {
13514 _PyUnicode_CONVERT_BYTES(
13515 Py_UCS1, Py_UCS4,
13516 ascii, ascii + len,
13517 (Py_UCS4 *)writer->data + writer->pos);
13518 break;
13519 }
13520 default:
13521 assert(0);
13522 }
13523
13524 writer->pos += len;
13525 return 0;
13526}
13527
13528int
13529_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13530 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013531{
13532 Py_UCS4 maxchar;
13533
13534 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13535 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13536 return -1;
13537 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13538 writer->pos += len;
13539 return 0;
13540}
13541
Victor Stinnerd3f08822012-05-29 12:57:52 +020013542PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013543_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013544{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013545 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013546 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013547 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013548 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013549 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013550 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013551 str = writer->buffer;
13552 writer->buffer = NULL;
13553 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13554 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013555 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013556 if (writer->pos == 0) {
13557 Py_CLEAR(writer->buffer);
13558
13559 /* Get the empty Unicode string singleton ('') */
13560 _Py_INCREF_UNICODE_EMPTY();
13561 str = unicode_empty;
Victor Stinner202fdca2012-05-07 12:47:02 +020013562 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013563 else {
13564 str = writer->buffer;
13565 writer->buffer = NULL;
13566
13567 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13568 PyObject *str2;
13569 str2 = resize_compact(str, writer->pos);
13570 if (str2 == NULL)
13571 return NULL;
13572 str = str2;
13573 }
13574 }
13575
Victor Stinner15a0bd32013-07-08 22:29:55 +020013576 assert(_PyUnicode_CheckConsistency(str, 1));
13577 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013578}
13579
Victor Stinnerd3f08822012-05-29 12:57:52 +020013580void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013581_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013582{
13583 Py_CLEAR(writer->buffer);
13584}
13585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013586#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013587
13588PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013589 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013590\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013591Return a formatted version of S, using substitutions from args and kwargs.\n\
13592The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013593
Eric Smith27bbca62010-11-04 17:06:58 +000013594PyDoc_STRVAR(format_map__doc__,
13595 "S.format_map(mapping) -> str\n\
13596\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013597Return a formatted version of S, using substitutions from mapping.\n\
13598The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013599
Eric Smith4a7d76d2008-05-30 18:10:19 +000013600static PyObject *
13601unicode__format__(PyObject* self, PyObject* args)
13602{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013603 PyObject *format_spec;
13604 _PyUnicodeWriter writer;
13605 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013606
13607 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13608 return NULL;
13609
Victor Stinnerd3f08822012-05-29 12:57:52 +020013610 if (PyUnicode_READY(self) == -1)
13611 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013612 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013613 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13614 self, format_spec, 0,
13615 PyUnicode_GET_LENGTH(format_spec));
13616 if (ret == -1) {
13617 _PyUnicodeWriter_Dealloc(&writer);
13618 return NULL;
13619 }
13620 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013621}
13622
Eric Smith8c663262007-08-25 02:26:07 +000013623PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013624 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013625\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013626Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013627
13628static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013629unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013630{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 Py_ssize_t size;
13632
13633 /* If it's a compact object, account for base structure +
13634 character data. */
13635 if (PyUnicode_IS_COMPACT_ASCII(v))
13636 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13637 else if (PyUnicode_IS_COMPACT(v))
13638 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013639 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013640 else {
13641 /* If it is a two-block object, account for base object, and
13642 for character block if present. */
13643 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013644 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013645 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013646 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013647 }
13648 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013649 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013650 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013651 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013652 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013653 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013654
13655 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013656}
13657
13658PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013659 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013660
13661static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013662unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013663{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013664 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013665 if (!copy)
13666 return NULL;
13667 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013668}
13669
Guido van Rossumd57fd912000-03-10 22:53:23 +000013670static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013671 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013672 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013673 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13674 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013675 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13676 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013677 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013678 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13679 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13680 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013681 {"expandtabs", (PyCFunction) unicode_expandtabs,
13682 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013683 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013684 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013685 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13686 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13687 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013688 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013689 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13690 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13691 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013692 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013693 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013694 {"splitlines", (PyCFunction) unicode_splitlines,
13695 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013696 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013697 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13698 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13699 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13700 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13701 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13702 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13703 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13704 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13705 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13706 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13707 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13708 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13709 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13710 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013711 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013712 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013713 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013714 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013715 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013716 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013717 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013718 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013719#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013720 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013721 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013722#endif
13723
Benjamin Peterson14339b62009-01-31 16:36:08 +000013724 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013725 {NULL, NULL}
13726};
13727
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013728static PyObject *
13729unicode_mod(PyObject *v, PyObject *w)
13730{
Brian Curtindfc80e32011-08-10 20:28:54 -050013731 if (!PyUnicode_Check(v))
13732 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013733 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013734}
13735
13736static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013737 0, /*nb_add*/
13738 0, /*nb_subtract*/
13739 0, /*nb_multiply*/
13740 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013741};
13742
Guido van Rossumd57fd912000-03-10 22:53:23 +000013743static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013744 (lenfunc) unicode_length, /* sq_length */
13745 PyUnicode_Concat, /* sq_concat */
13746 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13747 (ssizeargfunc) unicode_getitem, /* sq_item */
13748 0, /* sq_slice */
13749 0, /* sq_ass_item */
13750 0, /* sq_ass_slice */
13751 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013752};
13753
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013754static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013755unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013757 if (PyUnicode_READY(self) == -1)
13758 return NULL;
13759
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013760 if (PyIndex_Check(item)) {
13761 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013762 if (i == -1 && PyErr_Occurred())
13763 return NULL;
13764 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013765 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013766 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013767 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013768 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013769 PyObject *result;
13770 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013771 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013772 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013774 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013775 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013776 return NULL;
13777 }
13778
13779 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013780 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013781 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013782 slicelength == PyUnicode_GET_LENGTH(self)) {
13783 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013784 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013785 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013786 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013787 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013788 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013789 src_kind = PyUnicode_KIND(self);
13790 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013791 if (!PyUnicode_IS_ASCII(self)) {
13792 kind_limit = kind_maxchar_limit(src_kind);
13793 max_char = 0;
13794 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13795 ch = PyUnicode_READ(src_kind, src_data, cur);
13796 if (ch > max_char) {
13797 max_char = ch;
13798 if (max_char >= kind_limit)
13799 break;
13800 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013801 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013802 }
Victor Stinner55c99112011-10-13 01:17:06 +020013803 else
13804 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013805 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013806 if (result == NULL)
13807 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013808 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013809 dest_data = PyUnicode_DATA(result);
13810
13811 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013812 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13813 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013814 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013815 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013816 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013817 } else {
13818 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13819 return NULL;
13820 }
13821}
13822
13823static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013824 (lenfunc)unicode_length, /* mp_length */
13825 (binaryfunc)unicode_subscript, /* mp_subscript */
13826 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013827};
13828
Guido van Rossumd57fd912000-03-10 22:53:23 +000013829
Guido van Rossumd57fd912000-03-10 22:53:23 +000013830/* Helpers for PyUnicode_Format() */
13831
Victor Stinnera47082312012-10-04 02:19:54 +020013832struct unicode_formatter_t {
13833 PyObject *args;
13834 int args_owned;
13835 Py_ssize_t arglen, argidx;
13836 PyObject *dict;
13837
13838 enum PyUnicode_Kind fmtkind;
13839 Py_ssize_t fmtcnt, fmtpos;
13840 void *fmtdata;
13841 PyObject *fmtstr;
13842
13843 _PyUnicodeWriter writer;
13844};
13845
13846struct unicode_format_arg_t {
13847 Py_UCS4 ch;
13848 int flags;
13849 Py_ssize_t width;
13850 int prec;
13851 int sign;
13852};
13853
Guido van Rossumd57fd912000-03-10 22:53:23 +000013854static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013855unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013856{
Victor Stinnera47082312012-10-04 02:19:54 +020013857 Py_ssize_t argidx = ctx->argidx;
13858
13859 if (argidx < ctx->arglen) {
13860 ctx->argidx++;
13861 if (ctx->arglen < 0)
13862 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013863 else
Victor Stinnera47082312012-10-04 02:19:54 +020013864 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013865 }
13866 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013867 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013868 return NULL;
13869}
13870
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013871/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013872
Victor Stinnera47082312012-10-04 02:19:54 +020013873/* Format a float into the writer if the writer is not NULL, or into *p_output
13874 otherwise.
13875
13876 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013877static int
Victor Stinnera47082312012-10-04 02:19:54 +020013878formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13879 PyObject **p_output,
13880 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013881{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013882 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013883 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013884 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013885 int prec;
13886 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013887
Guido van Rossumd57fd912000-03-10 22:53:23 +000013888 x = PyFloat_AsDouble(v);
13889 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013890 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013891
Victor Stinnera47082312012-10-04 02:19:54 +020013892 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013893 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013894 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013895
Victor Stinnera47082312012-10-04 02:19:54 +020013896 if (arg->flags & F_ALT)
13897 dtoa_flags = Py_DTSF_ALT;
13898 else
13899 dtoa_flags = 0;
13900 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013901 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013902 return -1;
13903 len = strlen(p);
13904 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013905 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013906 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013907 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013908 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013909 }
13910 else
13911 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013912 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013913 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013914}
13915
Victor Stinnerd0880d52012-04-27 23:40:13 +020013916/* formatlong() emulates the format codes d, u, o, x and X, and
13917 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13918 * Python's regular ints.
13919 * Return value: a new PyUnicodeObject*, or NULL if error.
13920 * The output string is of the form
13921 * "-"? ("0x" | "0X")? digit+
13922 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13923 * set in flags. The case of hex digits will be correct,
13924 * There will be at least prec digits, zero-filled on the left if
13925 * necessary to get that many.
13926 * val object to be converted
13927 * flags bitmask of format flags; only F_ALT is looked at
13928 * prec minimum number of digits; 0-fill on left if needed
13929 * type a character in [duoxX]; u acts the same as d
13930 *
13931 * CAUTION: o, x and X conversions on regular ints can never
13932 * produce a '-' sign, but can for Python's unbounded ints.
13933 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013934PyObject *
13935_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013936{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013937 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013938 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013939 Py_ssize_t i;
13940 int sign; /* 1 if '-', else 0 */
13941 int len; /* number of characters */
13942 Py_ssize_t llen;
13943 int numdigits; /* len == numnondigits + numdigits */
13944 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013945
Victor Stinnerd0880d52012-04-27 23:40:13 +020013946 /* Avoid exceeding SSIZE_T_MAX */
13947 if (prec > INT_MAX-3) {
13948 PyErr_SetString(PyExc_OverflowError,
13949 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013950 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013951 }
13952
13953 assert(PyLong_Check(val));
13954
13955 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013956 default:
13957 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013958 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013959 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013960 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013961 /* int and int subclasses should print numerically when a numeric */
13962 /* format code is used (see issue18780) */
13963 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013964 break;
13965 case 'o':
13966 numnondigits = 2;
13967 result = PyNumber_ToBase(val, 8);
13968 break;
13969 case 'x':
13970 case 'X':
13971 numnondigits = 2;
13972 result = PyNumber_ToBase(val, 16);
13973 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013974 }
13975 if (!result)
13976 return NULL;
13977
13978 assert(unicode_modifiable(result));
13979 assert(PyUnicode_IS_READY(result));
13980 assert(PyUnicode_IS_ASCII(result));
13981
13982 /* To modify the string in-place, there can only be one reference. */
13983 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013984 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013985 PyErr_BadInternalCall();
13986 return NULL;
13987 }
13988 buf = PyUnicode_DATA(result);
13989 llen = PyUnicode_GET_LENGTH(result);
13990 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013991 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013992 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080013993 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013994 return NULL;
13995 }
13996 len = (int)llen;
13997 sign = buf[0] == '-';
13998 numnondigits += sign;
13999 numdigits = len - numnondigits;
14000 assert(numdigits > 0);
14001
14002 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014003 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014004 (type == 'o' || type == 'x' || type == 'X'))) {
14005 assert(buf[sign] == '0');
14006 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14007 buf[sign+1] == 'o');
14008 numnondigits -= 2;
14009 buf += 2;
14010 len -= 2;
14011 if (sign)
14012 buf[0] = '-';
14013 assert(len == numnondigits + numdigits);
14014 assert(numdigits > 0);
14015 }
14016
14017 /* Fill with leading zeroes to meet minimum width. */
14018 if (prec > numdigits) {
14019 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14020 numnondigits + prec);
14021 char *b1;
14022 if (!r1) {
14023 Py_DECREF(result);
14024 return NULL;
14025 }
14026 b1 = PyBytes_AS_STRING(r1);
14027 for (i = 0; i < numnondigits; ++i)
14028 *b1++ = *buf++;
14029 for (i = 0; i < prec - numdigits; i++)
14030 *b1++ = '0';
14031 for (i = 0; i < numdigits; i++)
14032 *b1++ = *buf++;
14033 *b1 = '\0';
14034 Py_DECREF(result);
14035 result = r1;
14036 buf = PyBytes_AS_STRING(result);
14037 len = numnondigits + prec;
14038 }
14039
14040 /* Fix up case for hex conversions. */
14041 if (type == 'X') {
14042 /* Need to convert all lower case letters to upper case.
14043 and need to convert 0x to 0X (and -0x to -0X). */
14044 for (i = 0; i < len; i++)
14045 if (buf[i] >= 'a' && buf[i] <= 'x')
14046 buf[i] -= 'a'-'A';
14047 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014048 if (!PyUnicode_Check(result)
14049 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014050 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014051 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014052 Py_DECREF(result);
14053 result = unicode;
14054 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014055 else if (len != PyUnicode_GET_LENGTH(result)) {
14056 if (PyUnicode_Resize(&result, len) < 0)
14057 Py_CLEAR(result);
14058 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014059 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014060}
14061
Ethan Furmandf3ed242014-01-05 06:50:30 -080014062/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014063 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014064 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014065 * -1 and raise an exception on error */
14066static int
Victor Stinnera47082312012-10-04 02:19:54 +020014067mainformatlong(PyObject *v,
14068 struct unicode_format_arg_t *arg,
14069 PyObject **p_output,
14070 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014071{
14072 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014073 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014074
14075 if (!PyNumber_Check(v))
14076 goto wrongtype;
14077
Ethan Furman9ab74802014-03-21 06:38:46 -070014078 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014079 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014080 if (type == 'o' || type == 'x' || type == 'X') {
14081 iobj = PyNumber_Index(v);
14082 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014083 if (PyErr_ExceptionMatches(PyExc_TypeError))
14084 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014085 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014086 }
14087 }
14088 else {
14089 iobj = PyNumber_Long(v);
14090 if (iobj == NULL ) {
14091 if (PyErr_ExceptionMatches(PyExc_TypeError))
14092 goto wrongtype;
14093 return -1;
14094 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014095 }
14096 assert(PyLong_Check(iobj));
14097 }
14098 else {
14099 iobj = v;
14100 Py_INCREF(iobj);
14101 }
14102
14103 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014104 && arg->width == -1 && arg->prec == -1
14105 && !(arg->flags & (F_SIGN | F_BLANK))
14106 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014107 {
14108 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014109 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014110 int base;
14111
Victor Stinnera47082312012-10-04 02:19:54 +020014112 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014113 {
14114 default:
14115 assert(0 && "'type' not in [diuoxX]");
14116 case 'd':
14117 case 'i':
14118 case 'u':
14119 base = 10;
14120 break;
14121 case 'o':
14122 base = 8;
14123 break;
14124 case 'x':
14125 case 'X':
14126 base = 16;
14127 break;
14128 }
14129
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014130 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14131 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014132 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014133 }
14134 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014135 return 1;
14136 }
14137
Ethan Furmanb95b5612015-01-23 20:05:18 -080014138 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014139 Py_DECREF(iobj);
14140 if (res == NULL)
14141 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014142 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014143 return 0;
14144
14145wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014146 switch(type)
14147 {
14148 case 'o':
14149 case 'x':
14150 case 'X':
14151 PyErr_Format(PyExc_TypeError,
14152 "%%%c format: an integer is required, "
14153 "not %.200s",
14154 type, Py_TYPE(v)->tp_name);
14155 break;
14156 default:
14157 PyErr_Format(PyExc_TypeError,
14158 "%%%c format: a number is required, "
14159 "not %.200s",
14160 type, Py_TYPE(v)->tp_name);
14161 break;
14162 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014163 return -1;
14164}
14165
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014166static Py_UCS4
14167formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014168{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014169 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014170 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014171 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014172 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014173 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014174 goto onError;
14175 }
14176 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014177 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014178 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014179 /* make sure number is a type of integer */
14180 if (!PyLong_Check(v)) {
14181 iobj = PyNumber_Index(v);
14182 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014183 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014184 }
14185 v = iobj;
14186 Py_DECREF(iobj);
14187 }
14188 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014189 x = PyLong_AsLong(v);
14190 if (x == -1 && PyErr_Occurred())
14191 goto onError;
14192
Victor Stinner8faf8212011-12-08 22:14:11 +010014193 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014194 PyErr_SetString(PyExc_OverflowError,
14195 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014196 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014197 }
14198
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014199 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014200 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014201
Benjamin Peterson29060642009-01-31 22:14:21 +000014202 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014203 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014204 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014205 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014206}
14207
Victor Stinnera47082312012-10-04 02:19:54 +020014208/* Parse options of an argument: flags, width, precision.
14209 Handle also "%(name)" syntax.
14210
14211 Return 0 if the argument has been formatted into arg->str.
14212 Return 1 if the argument has been written into ctx->writer,
14213 Raise an exception and return -1 on error. */
14214static int
14215unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14216 struct unicode_format_arg_t *arg)
14217{
14218#define FORMAT_READ(ctx) \
14219 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14220
14221 PyObject *v;
14222
Victor Stinnera47082312012-10-04 02:19:54 +020014223 if (arg->ch == '(') {
14224 /* Get argument value from a dictionary. Example: "%(name)s". */
14225 Py_ssize_t keystart;
14226 Py_ssize_t keylen;
14227 PyObject *key;
14228 int pcount = 1;
14229
14230 if (ctx->dict == NULL) {
14231 PyErr_SetString(PyExc_TypeError,
14232 "format requires a mapping");
14233 return -1;
14234 }
14235 ++ctx->fmtpos;
14236 --ctx->fmtcnt;
14237 keystart = ctx->fmtpos;
14238 /* Skip over balanced parentheses */
14239 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14240 arg->ch = FORMAT_READ(ctx);
14241 if (arg->ch == ')')
14242 --pcount;
14243 else if (arg->ch == '(')
14244 ++pcount;
14245 ctx->fmtpos++;
14246 }
14247 keylen = ctx->fmtpos - keystart - 1;
14248 if (ctx->fmtcnt < 0 || pcount > 0) {
14249 PyErr_SetString(PyExc_ValueError,
14250 "incomplete format key");
14251 return -1;
14252 }
14253 key = PyUnicode_Substring(ctx->fmtstr,
14254 keystart, keystart + keylen);
14255 if (key == NULL)
14256 return -1;
14257 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014258 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014259 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014260 }
14261 ctx->args = PyObject_GetItem(ctx->dict, key);
14262 Py_DECREF(key);
14263 if (ctx->args == NULL)
14264 return -1;
14265 ctx->args_owned = 1;
14266 ctx->arglen = -1;
14267 ctx->argidx = -2;
14268 }
14269
14270 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014271 while (--ctx->fmtcnt >= 0) {
14272 arg->ch = FORMAT_READ(ctx);
14273 ctx->fmtpos++;
14274 switch (arg->ch) {
14275 case '-': arg->flags |= F_LJUST; continue;
14276 case '+': arg->flags |= F_SIGN; continue;
14277 case ' ': arg->flags |= F_BLANK; continue;
14278 case '#': arg->flags |= F_ALT; continue;
14279 case '0': arg->flags |= F_ZERO; continue;
14280 }
14281 break;
14282 }
14283
14284 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014285 if (arg->ch == '*') {
14286 v = unicode_format_getnextarg(ctx);
14287 if (v == NULL)
14288 return -1;
14289 if (!PyLong_Check(v)) {
14290 PyErr_SetString(PyExc_TypeError,
14291 "* wants int");
14292 return -1;
14293 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014294 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014295 if (arg->width == -1 && PyErr_Occurred())
14296 return -1;
14297 if (arg->width < 0) {
14298 arg->flags |= F_LJUST;
14299 arg->width = -arg->width;
14300 }
14301 if (--ctx->fmtcnt >= 0) {
14302 arg->ch = FORMAT_READ(ctx);
14303 ctx->fmtpos++;
14304 }
14305 }
14306 else if (arg->ch >= '0' && arg->ch <= '9') {
14307 arg->width = arg->ch - '0';
14308 while (--ctx->fmtcnt >= 0) {
14309 arg->ch = FORMAT_READ(ctx);
14310 ctx->fmtpos++;
14311 if (arg->ch < '0' || arg->ch > '9')
14312 break;
14313 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14314 mixing signed and unsigned comparison. Since arg->ch is between
14315 '0' and '9', casting to int is safe. */
14316 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14317 PyErr_SetString(PyExc_ValueError,
14318 "width too big");
14319 return -1;
14320 }
14321 arg->width = arg->width*10 + (arg->ch - '0');
14322 }
14323 }
14324
14325 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014326 if (arg->ch == '.') {
14327 arg->prec = 0;
14328 if (--ctx->fmtcnt >= 0) {
14329 arg->ch = FORMAT_READ(ctx);
14330 ctx->fmtpos++;
14331 }
14332 if (arg->ch == '*') {
14333 v = unicode_format_getnextarg(ctx);
14334 if (v == NULL)
14335 return -1;
14336 if (!PyLong_Check(v)) {
14337 PyErr_SetString(PyExc_TypeError,
14338 "* wants int");
14339 return -1;
14340 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014341 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014342 if (arg->prec == -1 && PyErr_Occurred())
14343 return -1;
14344 if (arg->prec < 0)
14345 arg->prec = 0;
14346 if (--ctx->fmtcnt >= 0) {
14347 arg->ch = FORMAT_READ(ctx);
14348 ctx->fmtpos++;
14349 }
14350 }
14351 else if (arg->ch >= '0' && arg->ch <= '9') {
14352 arg->prec = arg->ch - '0';
14353 while (--ctx->fmtcnt >= 0) {
14354 arg->ch = FORMAT_READ(ctx);
14355 ctx->fmtpos++;
14356 if (arg->ch < '0' || arg->ch > '9')
14357 break;
14358 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14359 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014360 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014361 return -1;
14362 }
14363 arg->prec = arg->prec*10 + (arg->ch - '0');
14364 }
14365 }
14366 }
14367
14368 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14369 if (ctx->fmtcnt >= 0) {
14370 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14371 if (--ctx->fmtcnt >= 0) {
14372 arg->ch = FORMAT_READ(ctx);
14373 ctx->fmtpos++;
14374 }
14375 }
14376 }
14377 if (ctx->fmtcnt < 0) {
14378 PyErr_SetString(PyExc_ValueError,
14379 "incomplete format");
14380 return -1;
14381 }
14382 return 0;
14383
14384#undef FORMAT_READ
14385}
14386
14387/* Format one argument. Supported conversion specifiers:
14388
14389 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014390 - "i", "d", "u": int or float
14391 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014392 - "e", "E", "f", "F", "g", "G": float
14393 - "c": int or str (1 character)
14394
Victor Stinner8dbd4212012-12-04 09:30:24 +010014395 When possible, the output is written directly into the Unicode writer
14396 (ctx->writer). A string is created when padding is required.
14397
Victor Stinnera47082312012-10-04 02:19:54 +020014398 Return 0 if the argument has been formatted into *p_str,
14399 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014400 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014401static int
14402unicode_format_arg_format(struct unicode_formatter_t *ctx,
14403 struct unicode_format_arg_t *arg,
14404 PyObject **p_str)
14405{
14406 PyObject *v;
14407 _PyUnicodeWriter *writer = &ctx->writer;
14408
14409 if (ctx->fmtcnt == 0)
14410 ctx->writer.overallocate = 0;
14411
14412 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014413 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014414 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014415 return 1;
14416 }
14417
14418 v = unicode_format_getnextarg(ctx);
14419 if (v == NULL)
14420 return -1;
14421
Victor Stinnera47082312012-10-04 02:19:54 +020014422
14423 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014424 case 's':
14425 case 'r':
14426 case 'a':
14427 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14428 /* Fast path */
14429 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14430 return -1;
14431 return 1;
14432 }
14433
14434 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14435 *p_str = v;
14436 Py_INCREF(*p_str);
14437 }
14438 else {
14439 if (arg->ch == 's')
14440 *p_str = PyObject_Str(v);
14441 else if (arg->ch == 'r')
14442 *p_str = PyObject_Repr(v);
14443 else
14444 *p_str = PyObject_ASCII(v);
14445 }
14446 break;
14447
14448 case 'i':
14449 case 'd':
14450 case 'u':
14451 case 'o':
14452 case 'x':
14453 case 'X':
14454 {
14455 int ret = mainformatlong(v, arg, p_str, writer);
14456 if (ret != 0)
14457 return ret;
14458 arg->sign = 1;
14459 break;
14460 }
14461
14462 case 'e':
14463 case 'E':
14464 case 'f':
14465 case 'F':
14466 case 'g':
14467 case 'G':
14468 if (arg->width == -1 && arg->prec == -1
14469 && !(arg->flags & (F_SIGN | F_BLANK)))
14470 {
14471 /* Fast path */
14472 if (formatfloat(v, arg, NULL, writer) == -1)
14473 return -1;
14474 return 1;
14475 }
14476
14477 arg->sign = 1;
14478 if (formatfloat(v, arg, p_str, NULL) == -1)
14479 return -1;
14480 break;
14481
14482 case 'c':
14483 {
14484 Py_UCS4 ch = formatchar(v);
14485 if (ch == (Py_UCS4) -1)
14486 return -1;
14487 if (arg->width == -1 && arg->prec == -1) {
14488 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014489 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014490 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014491 return 1;
14492 }
14493 *p_str = PyUnicode_FromOrdinal(ch);
14494 break;
14495 }
14496
14497 default:
14498 PyErr_Format(PyExc_ValueError,
14499 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014500 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014501 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14502 (int)arg->ch,
14503 ctx->fmtpos - 1);
14504 return -1;
14505 }
14506 if (*p_str == NULL)
14507 return -1;
14508 assert (PyUnicode_Check(*p_str));
14509 return 0;
14510}
14511
14512static int
14513unicode_format_arg_output(struct unicode_formatter_t *ctx,
14514 struct unicode_format_arg_t *arg,
14515 PyObject *str)
14516{
14517 Py_ssize_t len;
14518 enum PyUnicode_Kind kind;
14519 void *pbuf;
14520 Py_ssize_t pindex;
14521 Py_UCS4 signchar;
14522 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014523 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014524 Py_ssize_t sublen;
14525 _PyUnicodeWriter *writer = &ctx->writer;
14526 Py_UCS4 fill;
14527
14528 fill = ' ';
14529 if (arg->sign && arg->flags & F_ZERO)
14530 fill = '0';
14531
14532 if (PyUnicode_READY(str) == -1)
14533 return -1;
14534
14535 len = PyUnicode_GET_LENGTH(str);
14536 if ((arg->width == -1 || arg->width <= len)
14537 && (arg->prec == -1 || arg->prec >= len)
14538 && !(arg->flags & (F_SIGN | F_BLANK)))
14539 {
14540 /* Fast path */
14541 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14542 return -1;
14543 return 0;
14544 }
14545
14546 /* Truncate the string for "s", "r" and "a" formats
14547 if the precision is set */
14548 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14549 if (arg->prec >= 0 && len > arg->prec)
14550 len = arg->prec;
14551 }
14552
14553 /* Adjust sign and width */
14554 kind = PyUnicode_KIND(str);
14555 pbuf = PyUnicode_DATA(str);
14556 pindex = 0;
14557 signchar = '\0';
14558 if (arg->sign) {
14559 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14560 if (ch == '-' || ch == '+') {
14561 signchar = ch;
14562 len--;
14563 pindex++;
14564 }
14565 else if (arg->flags & F_SIGN)
14566 signchar = '+';
14567 else if (arg->flags & F_BLANK)
14568 signchar = ' ';
14569 else
14570 arg->sign = 0;
14571 }
14572 if (arg->width < len)
14573 arg->width = len;
14574
14575 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014576 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014577 if (!(arg->flags & F_LJUST)) {
14578 if (arg->sign) {
14579 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014580 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014581 }
14582 else {
14583 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014584 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014585 }
14586 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014587 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14588 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014589 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014590 }
14591
Victor Stinnera47082312012-10-04 02:19:54 +020014592 buflen = arg->width;
14593 if (arg->sign && len == arg->width)
14594 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014595 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014596 return -1;
14597
14598 /* Write the sign if needed */
14599 if (arg->sign) {
14600 if (fill != ' ') {
14601 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14602 writer->pos += 1;
14603 }
14604 if (arg->width > len)
14605 arg->width--;
14606 }
14607
14608 /* Write the numeric prefix for "x", "X" and "o" formats
14609 if the alternate form is used.
14610 For example, write "0x" for the "%#x" format. */
14611 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14612 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14613 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14614 if (fill != ' ') {
14615 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14616 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14617 writer->pos += 2;
14618 pindex += 2;
14619 }
14620 arg->width -= 2;
14621 if (arg->width < 0)
14622 arg->width = 0;
14623 len -= 2;
14624 }
14625
14626 /* Pad left with the fill character if needed */
14627 if (arg->width > len && !(arg->flags & F_LJUST)) {
14628 sublen = arg->width - len;
14629 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14630 writer->pos += sublen;
14631 arg->width = len;
14632 }
14633
14634 /* If padding with spaces: write sign if needed and/or numeric prefix if
14635 the alternate form is used */
14636 if (fill == ' ') {
14637 if (arg->sign) {
14638 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14639 writer->pos += 1;
14640 }
14641 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14642 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14643 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14644 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14645 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14646 writer->pos += 2;
14647 pindex += 2;
14648 }
14649 }
14650
14651 /* Write characters */
14652 if (len) {
14653 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14654 str, pindex, len);
14655 writer->pos += len;
14656 }
14657
14658 /* Pad right with the fill character if needed */
14659 if (arg->width > len) {
14660 sublen = arg->width - len;
14661 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14662 writer->pos += sublen;
14663 }
14664 return 0;
14665}
14666
14667/* Helper of PyUnicode_Format(): format one arg.
14668 Return 0 on success, raise an exception and return -1 on error. */
14669static int
14670unicode_format_arg(struct unicode_formatter_t *ctx)
14671{
14672 struct unicode_format_arg_t arg;
14673 PyObject *str;
14674 int ret;
14675
Victor Stinner8dbd4212012-12-04 09:30:24 +010014676 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14677 arg.flags = 0;
14678 arg.width = -1;
14679 arg.prec = -1;
14680 arg.sign = 0;
14681 str = NULL;
14682
Victor Stinnera47082312012-10-04 02:19:54 +020014683 ret = unicode_format_arg_parse(ctx, &arg);
14684 if (ret == -1)
14685 return -1;
14686
14687 ret = unicode_format_arg_format(ctx, &arg, &str);
14688 if (ret == -1)
14689 return -1;
14690
14691 if (ret != 1) {
14692 ret = unicode_format_arg_output(ctx, &arg, str);
14693 Py_DECREF(str);
14694 if (ret == -1)
14695 return -1;
14696 }
14697
14698 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14699 PyErr_SetString(PyExc_TypeError,
14700 "not all arguments converted during string formatting");
14701 return -1;
14702 }
14703 return 0;
14704}
14705
Alexander Belopolsky40018472011-02-26 01:02:56 +000014706PyObject *
14707PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014708{
Victor Stinnera47082312012-10-04 02:19:54 +020014709 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014710
Guido van Rossumd57fd912000-03-10 22:53:23 +000014711 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014712 PyErr_BadInternalCall();
14713 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014714 }
Victor Stinnera47082312012-10-04 02:19:54 +020014715
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014716 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014717 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014718
14719 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014720 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14721 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14722 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14723 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014724
Victor Stinner8f674cc2013-04-17 23:02:17 +020014725 _PyUnicodeWriter_Init(&ctx.writer);
14726 ctx.writer.min_length = ctx.fmtcnt + 100;
14727 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014728
Guido van Rossumd57fd912000-03-10 22:53:23 +000014729 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014730 ctx.arglen = PyTuple_Size(args);
14731 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014732 }
14733 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014734 ctx.arglen = -1;
14735 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014736 }
Victor Stinnera47082312012-10-04 02:19:54 +020014737 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014738 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014739 ctx.dict = args;
14740 else
14741 ctx.dict = NULL;
14742 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014743
Victor Stinnera47082312012-10-04 02:19:54 +020014744 while (--ctx.fmtcnt >= 0) {
14745 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014746 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014747
14748 nonfmtpos = ctx.fmtpos++;
14749 while (ctx.fmtcnt >= 0 &&
14750 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14751 ctx.fmtpos++;
14752 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014753 }
Victor Stinnera47082312012-10-04 02:19:54 +020014754 if (ctx.fmtcnt < 0) {
14755 ctx.fmtpos--;
14756 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014757 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014758
Victor Stinnercfc4c132013-04-03 01:48:39 +020014759 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14760 nonfmtpos, ctx.fmtpos) < 0)
14761 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014762 }
14763 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014764 ctx.fmtpos++;
14765 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014766 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014767 }
14768 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014769
Victor Stinnera47082312012-10-04 02:19:54 +020014770 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014771 PyErr_SetString(PyExc_TypeError,
14772 "not all arguments converted during string formatting");
14773 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014774 }
14775
Victor Stinnera47082312012-10-04 02:19:54 +020014776 if (ctx.args_owned) {
14777 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014778 }
Victor Stinnera47082312012-10-04 02:19:54 +020014779 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014780
Benjamin Peterson29060642009-01-31 22:14:21 +000014781 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014782 _PyUnicodeWriter_Dealloc(&ctx.writer);
14783 if (ctx.args_owned) {
14784 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014785 }
14786 return NULL;
14787}
14788
Jeremy Hylton938ace62002-07-17 16:30:39 +000014789static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014790unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14791
Tim Peters6d6c1a32001-08-02 04:15:00 +000014792static PyObject *
14793unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14794{
Benjamin Peterson29060642009-01-31 22:14:21 +000014795 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014796 static char *kwlist[] = {"object", "encoding", "errors", 0};
14797 char *encoding = NULL;
14798 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014799
Benjamin Peterson14339b62009-01-31 16:36:08 +000014800 if (type != &PyUnicode_Type)
14801 return unicode_subtype_new(type, args, kwds);
14802 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014803 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014804 return NULL;
14805 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014806 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014807 if (encoding == NULL && errors == NULL)
14808 return PyObject_Str(x);
14809 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014810 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014811}
14812
Guido van Rossume023fe02001-08-30 03:12:59 +000014813static PyObject *
14814unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14815{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014816 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014817 Py_ssize_t length, char_size;
14818 int share_wstr, share_utf8;
14819 unsigned int kind;
14820 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014821
Benjamin Peterson14339b62009-01-31 16:36:08 +000014822 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014823
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014824 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014825 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014826 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014827 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014828 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014829 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014830 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014831 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014832
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014833 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014834 if (self == NULL) {
14835 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014836 return NULL;
14837 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014838 kind = PyUnicode_KIND(unicode);
14839 length = PyUnicode_GET_LENGTH(unicode);
14840
14841 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014842#ifdef Py_DEBUG
14843 _PyUnicode_HASH(self) = -1;
14844#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014845 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014846#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014847 _PyUnicode_STATE(self).interned = 0;
14848 _PyUnicode_STATE(self).kind = kind;
14849 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014850 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014851 _PyUnicode_STATE(self).ready = 1;
14852 _PyUnicode_WSTR(self) = NULL;
14853 _PyUnicode_UTF8_LENGTH(self) = 0;
14854 _PyUnicode_UTF8(self) = NULL;
14855 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014856 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014857
14858 share_utf8 = 0;
14859 share_wstr = 0;
14860 if (kind == PyUnicode_1BYTE_KIND) {
14861 char_size = 1;
14862 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14863 share_utf8 = 1;
14864 }
14865 else if (kind == PyUnicode_2BYTE_KIND) {
14866 char_size = 2;
14867 if (sizeof(wchar_t) == 2)
14868 share_wstr = 1;
14869 }
14870 else {
14871 assert(kind == PyUnicode_4BYTE_KIND);
14872 char_size = 4;
14873 if (sizeof(wchar_t) == 4)
14874 share_wstr = 1;
14875 }
14876
14877 /* Ensure we won't overflow the length. */
14878 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14879 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014880 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014881 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014882 data = PyObject_MALLOC((length + 1) * char_size);
14883 if (data == NULL) {
14884 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014885 goto onError;
14886 }
14887
Victor Stinnerc3c74152011-10-02 20:39:55 +020014888 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014889 if (share_utf8) {
14890 _PyUnicode_UTF8_LENGTH(self) = length;
14891 _PyUnicode_UTF8(self) = data;
14892 }
14893 if (share_wstr) {
14894 _PyUnicode_WSTR_LENGTH(self) = length;
14895 _PyUnicode_WSTR(self) = (wchar_t *)data;
14896 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014897
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014898 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014899 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014900 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014901#ifdef Py_DEBUG
14902 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14903#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014904 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014905 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014906
14907onError:
14908 Py_DECREF(unicode);
14909 Py_DECREF(self);
14910 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014911}
14912
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014913PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014914"str(object='') -> str\n\
14915str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014916\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014917Create a new string object from the given object. If encoding or\n\
14918errors is specified, then the object must expose a data buffer\n\
14919that will be decoded using the given encoding and error handler.\n\
14920Otherwise, returns the result of object.__str__() (if defined)\n\
14921or repr(object).\n\
14922encoding defaults to sys.getdefaultencoding().\n\
14923errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014924
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014925static PyObject *unicode_iter(PyObject *seq);
14926
Guido van Rossumd57fd912000-03-10 22:53:23 +000014927PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014928 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014929 "str", /* tp_name */
14930 sizeof(PyUnicodeObject), /* tp_size */
14931 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014932 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014933 (destructor)unicode_dealloc, /* tp_dealloc */
14934 0, /* tp_print */
14935 0, /* tp_getattr */
14936 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014937 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014938 unicode_repr, /* tp_repr */
14939 &unicode_as_number, /* tp_as_number */
14940 &unicode_as_sequence, /* tp_as_sequence */
14941 &unicode_as_mapping, /* tp_as_mapping */
14942 (hashfunc) unicode_hash, /* tp_hash*/
14943 0, /* tp_call*/
14944 (reprfunc) unicode_str, /* tp_str */
14945 PyObject_GenericGetAttr, /* tp_getattro */
14946 0, /* tp_setattro */
14947 0, /* tp_as_buffer */
14948 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014949 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014950 unicode_doc, /* tp_doc */
14951 0, /* tp_traverse */
14952 0, /* tp_clear */
14953 PyUnicode_RichCompare, /* tp_richcompare */
14954 0, /* tp_weaklistoffset */
14955 unicode_iter, /* tp_iter */
14956 0, /* tp_iternext */
14957 unicode_methods, /* tp_methods */
14958 0, /* tp_members */
14959 0, /* tp_getset */
14960 &PyBaseObject_Type, /* tp_base */
14961 0, /* tp_dict */
14962 0, /* tp_descr_get */
14963 0, /* tp_descr_set */
14964 0, /* tp_dictoffset */
14965 0, /* tp_init */
14966 0, /* tp_alloc */
14967 unicode_new, /* tp_new */
14968 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014969};
14970
14971/* Initialize the Unicode implementation */
14972
Victor Stinner3a50e702011-10-18 21:21:00 +020014973int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014974{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014975 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014976 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014977 0x000A, /* LINE FEED */
14978 0x000D, /* CARRIAGE RETURN */
14979 0x001C, /* FILE SEPARATOR */
14980 0x001D, /* GROUP SEPARATOR */
14981 0x001E, /* RECORD SEPARATOR */
14982 0x0085, /* NEXT LINE */
14983 0x2028, /* LINE SEPARATOR */
14984 0x2029, /* PARAGRAPH SEPARATOR */
14985 };
14986
Fred Drakee4315f52000-05-09 19:53:39 +000014987 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014988 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014989 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014990 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014991 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014992
Guido van Rossumcacfc072002-05-24 19:01:59 +000014993 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014994 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014995
14996 /* initialize the linebreak bloom filter */
14997 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014998 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014999 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015000
Christian Heimes26532f72013-07-20 14:57:16 +020015001 if (PyType_Ready(&EncodingMapType) < 0)
15002 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015003
Benjamin Petersonc4311282012-10-30 23:21:10 -040015004 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15005 Py_FatalError("Can't initialize field name iterator type");
15006
15007 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15008 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015009
Victor Stinner3a50e702011-10-18 21:21:00 +020015010 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015011}
15012
/* Clear the free list.  This implementation releases nothing and
   always returns 0. */

int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15020
Guido van Rossumd57fd912000-03-10 22:53:23 +000015021void
Thomas Wouters78890102000-07-22 19:25:51 +000015022_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015023{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015024 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015025
Serhiy Storchaka05997252013-01-26 12:14:02 +020015026 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015027
Serhiy Storchaka05997252013-01-26 12:14:02 +020015028 for (i = 0; i < 256; i++)
15029 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015030 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015031 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015032}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015033
Walter Dörwald16807132007-05-25 13:52:07 +000015034void
15035PyUnicode_InternInPlace(PyObject **p)
15036{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015037 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015038 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015039#ifdef Py_DEBUG
15040 assert(s != NULL);
15041 assert(_PyUnicode_CHECK(s));
15042#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015043 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015044 return;
15045#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015046 /* If it's a subclass, we don't really know what putting
15047 it in the interned dict might do. */
15048 if (!PyUnicode_CheckExact(s))
15049 return;
15050 if (PyUnicode_CHECK_INTERNED(s))
15051 return;
15052 if (interned == NULL) {
15053 interned = PyDict_New();
15054 if (interned == NULL) {
15055 PyErr_Clear(); /* Don't leave an exception */
15056 return;
15057 }
15058 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015059 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015060 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015061 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015062 if (t == NULL) {
15063 PyErr_Clear();
15064 return;
15065 }
15066 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015067 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015068 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015069 return;
15070 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015071 /* The two references in interned are not counted by refcnt.
15072 The deallocator will take care of this */
15073 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015074 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015075}
15076
15077void
15078PyUnicode_InternImmortal(PyObject **p)
15079{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015080 PyUnicode_InternInPlace(p);
15081 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015082 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015083 Py_INCREF(*p);
15084 }
Walter Dörwald16807132007-05-25 13:52:07 +000015085}
15086
15087PyObject *
15088PyUnicode_InternFromString(const char *cp)
15089{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015090 PyObject *s = PyUnicode_FromString(cp);
15091 if (s == NULL)
15092 return NULL;
15093 PyUnicode_InternInPlace(&s);
15094 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015095}
15096
Alexander Belopolsky40018472011-02-26 01:02:56 +000015097void
15098_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015099{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015100 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015101 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015102 Py_ssize_t i, n;
15103 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015104
Benjamin Peterson14339b62009-01-31 16:36:08 +000015105 if (interned == NULL || !PyDict_Check(interned))
15106 return;
15107 keys = PyDict_Keys(interned);
15108 if (keys == NULL || !PyList_Check(keys)) {
15109 PyErr_Clear();
15110 return;
15111 }
Walter Dörwald16807132007-05-25 13:52:07 +000015112
Benjamin Peterson14339b62009-01-31 16:36:08 +000015113 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15114 detector, interned unicode strings are not forcibly deallocated;
15115 rather, we give them their stolen references back, and then clear
15116 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015117
Benjamin Peterson14339b62009-01-31 16:36:08 +000015118 n = PyList_GET_SIZE(keys);
15119 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015120 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015121 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015122 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015123 if (PyUnicode_READY(s) == -1) {
15124 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015125 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015126 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015127 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015128 case SSTATE_NOT_INTERNED:
15129 /* XXX Shouldn't happen */
15130 break;
15131 case SSTATE_INTERNED_IMMORTAL:
15132 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015133 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015134 break;
15135 case SSTATE_INTERNED_MORTAL:
15136 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015137 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015138 break;
15139 default:
15140 Py_FatalError("Inconsistent interned string state.");
15141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015142 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015143 }
15144 fprintf(stderr, "total size of all interned strings: "
15145 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15146 "mortal/immortal\n", mortal_size, immortal_size);
15147 Py_DECREF(keys);
15148 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015149 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015150}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015151
15152
15153/********************* Unicode Iterator **************************/
15154
15155typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015156 PyObject_HEAD
15157 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015158 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015159} unicodeiterobject;
15160
15161static void
15162unicodeiter_dealloc(unicodeiterobject *it)
15163{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015164 _PyObject_GC_UNTRACK(it);
15165 Py_XDECREF(it->it_seq);
15166 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015167}
15168
15169static int
15170unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15171{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015172 Py_VISIT(it->it_seq);
15173 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015174}
15175
15176static PyObject *
15177unicodeiter_next(unicodeiterobject *it)
15178{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015179 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015180
Benjamin Peterson14339b62009-01-31 16:36:08 +000015181 assert(it != NULL);
15182 seq = it->it_seq;
15183 if (seq == NULL)
15184 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015185 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015187 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15188 int kind = PyUnicode_KIND(seq);
15189 void *data = PyUnicode_DATA(seq);
15190 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15191 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015192 if (item != NULL)
15193 ++it->it_index;
15194 return item;
15195 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015196
Benjamin Peterson14339b62009-01-31 16:36:08 +000015197 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015198 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015199 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015200}
15201
15202static PyObject *
15203unicodeiter_len(unicodeiterobject *it)
15204{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015205 Py_ssize_t len = 0;
15206 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015207 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015208 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015209}
15210
15211PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15212
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015213static PyObject *
15214unicodeiter_reduce(unicodeiterobject *it)
15215{
15216 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015217 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015218 it->it_seq, it->it_index);
15219 } else {
15220 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15221 if (u == NULL)
15222 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015223 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015224 }
15225}
15226
15227PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15228
15229static PyObject *
15230unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15231{
15232 Py_ssize_t index = PyLong_AsSsize_t(state);
15233 if (index == -1 && PyErr_Occurred())
15234 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015235 if (it->it_seq != NULL) {
15236 if (index < 0)
15237 index = 0;
15238 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15239 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15240 it->it_index = index;
15241 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015242 Py_RETURN_NONE;
15243}
15244
15245PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15246
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015247static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015248 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015249 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015250 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15251 reduce_doc},
15252 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15253 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015254 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015255};
15256
15257PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015258 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15259 "str_iterator", /* tp_name */
15260 sizeof(unicodeiterobject), /* tp_basicsize */
15261 0, /* tp_itemsize */
15262 /* methods */
15263 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15264 0, /* tp_print */
15265 0, /* tp_getattr */
15266 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015267 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015268 0, /* tp_repr */
15269 0, /* tp_as_number */
15270 0, /* tp_as_sequence */
15271 0, /* tp_as_mapping */
15272 0, /* tp_hash */
15273 0, /* tp_call */
15274 0, /* tp_str */
15275 PyObject_GenericGetAttr, /* tp_getattro */
15276 0, /* tp_setattro */
15277 0, /* tp_as_buffer */
15278 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15279 0, /* tp_doc */
15280 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15281 0, /* tp_clear */
15282 0, /* tp_richcompare */
15283 0, /* tp_weaklistoffset */
15284 PyObject_SelfIter, /* tp_iter */
15285 (iternextfunc)unicodeiter_next, /* tp_iternext */
15286 unicodeiter_methods, /* tp_methods */
15287 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015288};
15289
15290static PyObject *
15291unicode_iter(PyObject *seq)
15292{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015293 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015294
Benjamin Peterson14339b62009-01-31 16:36:08 +000015295 if (!PyUnicode_Check(seq)) {
15296 PyErr_BadInternalCall();
15297 return NULL;
15298 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015299 if (PyUnicode_READY(seq) == -1)
15300 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015301 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15302 if (it == NULL)
15303 return NULL;
15304 it->it_index = 0;
15305 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015306 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015307 _PyObject_GC_TRACK(it);
15308 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015309}
15310
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015311
15312size_t
15313Py_UNICODE_strlen(const Py_UNICODE *u)
15314{
15315 int res = 0;
15316 while(*u++)
15317 res++;
15318 return res;
15319}
15320
15321Py_UNICODE*
15322Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15323{
15324 Py_UNICODE *u = s1;
15325 while ((*u++ = *s2++));
15326 return s1;
15327}
15328
15329Py_UNICODE*
15330Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15331{
15332 Py_UNICODE *u = s1;
15333 while ((*u++ = *s2++))
15334 if (n-- == 0)
15335 break;
15336 return s1;
15337}
15338
15339Py_UNICODE*
15340Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15341{
15342 Py_UNICODE *u1 = s1;
15343 u1 += Py_UNICODE_strlen(u1);
15344 Py_UNICODE_strcpy(u1, s2);
15345 return s1;
15346}
15347
15348int
15349Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15350{
15351 while (*s1 && *s2 && *s1 == *s2)
15352 s1++, s2++;
15353 if (*s1 && *s2)
15354 return (*s1 < *s2) ? -1 : +1;
15355 if (*s1)
15356 return 1;
15357 if (*s2)
15358 return -1;
15359 return 0;
15360}
15361
15362int
15363Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15364{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015365 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015366 for (; n != 0; n--) {
15367 u1 = *s1;
15368 u2 = *s2;
15369 if (u1 != u2)
15370 return (u1 < u2) ? -1 : +1;
15371 if (u1 == '\0')
15372 return 0;
15373 s1++;
15374 s2++;
15375 }
15376 return 0;
15377}
15378
15379Py_UNICODE*
15380Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15381{
15382 const Py_UNICODE *p;
15383 for (p = s; *p; p++)
15384 if (*p == c)
15385 return (Py_UNICODE*)p;
15386 return NULL;
15387}
15388
15389Py_UNICODE*
15390Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15391{
15392 const Py_UNICODE *p;
15393 p = s + Py_UNICODE_strlen(s);
15394 while (p != s) {
15395 p--;
15396 if (*p == c)
15397 return (Py_UNICODE*)p;
15398 }
15399 return NULL;
15400}
Victor Stinner331ea922010-08-10 16:37:20 +000015401
Victor Stinner71133ff2010-09-01 23:43:53 +000015402Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015403PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015404{
Victor Stinner577db2c2011-10-11 22:12:48 +020015405 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015406 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015408 if (!PyUnicode_Check(unicode)) {
15409 PyErr_BadArgument();
15410 return NULL;
15411 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015412 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015413 if (u == NULL)
15414 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015415 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015416 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015417 PyErr_NoMemory();
15418 return NULL;
15419 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015420 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015421 size *= sizeof(Py_UNICODE);
15422 copy = PyMem_Malloc(size);
15423 if (copy == NULL) {
15424 PyErr_NoMemory();
15425 return NULL;
15426 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015427 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015428 return copy;
15429}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015430
Georg Brandl66c221e2010-10-14 07:04:07 +000015431/* A _string module, to export formatter_parser and formatter_field_name_split
15432 to the string.Formatter class implemented in Python. */
15433
15434static PyMethodDef _string_methods[] = {
15435 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15436 METH_O, PyDoc_STR("split the argument as a field name")},
15437 {"formatter_parser", (PyCFunction) formatter_parser,
15438 METH_O, PyDoc_STR("parse the argument as a format string")},
15439 {NULL, NULL}
15440};
15441
15442static struct PyModuleDef _string_module = {
15443 PyModuleDef_HEAD_INIT,
15444 "_string",
15445 PyDoc_STR("string helper module"),
15446 0,
15447 _string_methods,
15448 NULL,
15449 NULL,
15450 NULL,
15451 NULL
15452};
15453
15454PyMODINIT_FUNC
15455PyInit__string(void)
15456{
15457 return PyModule_Create(&_string_module);
15458}
15459
15460
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015461#ifdef __cplusplus
15462}
15463#endif