blob: fdc3197470c4e552a7e92d31684b3633a4b31e41 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Eric Snow2ebc5ce2017-09-07 23:51:28 -060043#include "internal/pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050045#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070046#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Larry Hastings61272b72014-01-07 12:41:53 -080052/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090053class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080054[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090055/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
56
57/*[python input]
58class Py_UCS4_converter(CConverter):
59 type = 'Py_UCS4'
60 converter = 'convert_uc'
61
62 def converter_init(self):
63 if self.default is not unspecified:
64 self.c_default = ascii(self.default)
65 if len(self.c_default) > 4 or self.c_default[0] != "'":
66 self.c_default = hex(ord(self.default))
67
68[python start generated code]*/
69/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080070
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000078
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000079
80#ifdef __cplusplus
81extern "C" {
82#endif
83
/* Largest code point defined by Unicode: U+10FFFF (1,114,111 decimal). */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros of this file.  Debug builds run
   the structural consistency checker (without scanning the characters);
   other builds only verify that the object is a str instance. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020092
/* Unchecked access to the cached UTF-8 pointer of the compact header. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* Checked UTF-8 pointer of a ready string: a compact ASCII string keeps its
   bytes immediately behind the PyASCIIObject header, every other string
   uses the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char *)((PyASCIIObject *)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field of the compact header. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Checked UTF-8 length of a ready string: for compact ASCII it is the
   character length, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject *)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked lvalue accessors for the remaining header fields. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Same fields as above, but guarded by the _PyUnicode_CHECK assertion. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data pointer of a non-compact object. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200127
/* File-local override of PyUnicode_READY() that goes through the
   _PyUnicode_CHECK assertion.  Evaluates to 0 when the string is already
   ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200134
/* True if the utf8 pointer aliases the canonical character data, i.e. no
   separate UTF-8 buffer exists.  Must not be used on compact ASCII strings
   (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer aliases the canonical character data.
   The macro argument is used throughout; the expansion previously referred
   to a variable literally named "unicode", so it only compiled (and only
   tested the right object) when the caller happened to use that name. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142
/* Non-zero when the object owns a dedicated UTF-8 buffer: it is not compact
   ASCII, the utf8 pointer is set, and that pointer does not alias the
   character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (!PyUnicode_IS_COMPACT_ASCII(op) && \
     _PyUnicode_UTF8(op) != NULL && \
     _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero when the object owns a dedicated wstr buffer: the wstr pointer is
   set and either the string is not ready yet or the pointer does not alias
   the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (_PyUnicode_WSTR(op) != NULL && \
     (!PyUnicode_IS_READY(op) || \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
156
/* Copy a run of characters while converting the element type.
   from_type and to_type must be valid type names.  begin and end delimit the
   source run and are treated as "from_type *"; to is treated as "to_type *"
   and receives one converted element per source element.  The bulk of the
   run is copied four elements per iteration, the remainder one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_cb_dst = (to_type *)(to);                          \
        const from_type *_cb_src = (from_type *)(begin);             \
        const from_type *_cb_stop = (from_type *)(end);              \
        Py_ssize_t _cb_count = (_cb_stop) - (_cb_src);               \
        const from_type *_cb_bulk_stop =                             \
            _cb_src + _Py_SIZE_ROUND_DOWN(_cb_count, 4);             \
        for (; _cb_src < (_cb_bulk_stop);                            \
               _cb_src += 4, _cb_dst += 4) {                         \
            _cb_dst[0] = (to_type) _cb_src[0];                       \
            _cb_dst[1] = (to_type) _cb_src[1];                       \
            _cb_dst[2] = (to_type) _cb_src[2];                       \
            _cb_dst[3] = (to_type) _cb_src[3];                       \
        }                                                            \
        for (; _cb_src < (_cb_stop); ++_cb_src, ++_cb_dst) {         \
            *_cb_dst = (to_type) *_cb_src;                           \
        }                                                            \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200180
/* Platform-tuned over-allocation factor.  The value is a divisor-style
   factor: 2 corresponds to 50% extra space, 4 to 25%.
   NOTE(review): the code consuming this constant is not in this part of the
   file -- confirm how it is applied before changing the values. */
#if defined(MS_WINDOWS)
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
188
Walter Dörwald16807132007-05-25 13:52:07 +0000189/* This dictionary holds all interned unicode strings. Note that references
190 to strings in this dictionary are *not* counted in the string's ob_refcnt.
191 When the interned string reaches a refcnt of 0 the string deallocation
192 function will delete the reference from this dictionary.
193
194 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000195 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000196*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200197static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000198
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000199/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200201
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200203 do { \
204 if (unicode_empty != NULL) \
205 Py_INCREF(unicode_empty); \
206 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200207 unicode_empty = PyUnicode_New(0, 0); \
208 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200209 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
211 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000214
Serhiy Storchaka678db842013-01-26 12:16:36 +0200215#define _Py_RETURN_UNICODE_EMPTY() \
216 do { \
217 _Py_INCREF_UNICODE_EMPTY(); \
218 return unicode_empty; \
219 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000220
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200221/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700222static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200223_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
224
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200225/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200226static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200227
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000228/* Single character Unicode strings in the Latin-1 range are being
229 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200230static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
/* Fast detection of the most frequent whitespace characters: lookup table
   indexed by an ASCII code (0..127); an entry is 1 for whitespace and 0
   otherwise. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
262
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200263/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200264static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200265static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100266static int unicode_modifiable(PyObject *unicode);
267
Victor Stinnerfe226c02011-10-03 03:52:20 +0200268
Alexander Belopolsky40018472011-02-26 01:02:56 +0000269static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100270_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200271static PyObject *
272_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
273static PyObject *
274_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
275
276static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000278 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100279 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000280 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
281
Alexander Belopolsky40018472011-02-26 01:02:56 +0000282static void
283raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300284 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100285 PyObject *unicode,
286 Py_ssize_t startpos, Py_ssize_t endpos,
287 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000288
/* Fast detection of line break characters: lookup table indexed by an ASCII
   code (0..127); an entry is 1 for a line break and 0 otherwise. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
316
INADA Naoki3ae20562017-01-16 20:41:20 +0900317static int convert_uc(PyObject *obj, void *addr);
318
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300319#include "clinic/unicodeobject.c.h"
320
/* Codec error handlers that are recognized by name.  _Py_ERROR_OTHER stands
   for any name that is not in the list. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL and for "strict", the matching constant
   for the other known names (exact, case-sensitive comparison) and
   _Py_ERROR_OTHER for everything else. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    /* No handler name means the default, "strict". */
    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
359
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300360/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
361 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000362Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000363PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000364{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000365#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000366 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000367#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000368 /* This is actually an illegal character, so it should
369 not be passed to unichr. */
370 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000371#endif
372}
373
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a str object.

   op:            object to inspect; must be a str instance (asserted).
   check_content: when non-zero (and the string is not in the legacy wchar
                  representation) the characters are scanned as well, to
                  verify that the narrowest possible kind is used and that
                  the data is NUL-terminated.

   Every violated invariant trips an assert(); the function returns 1 when
   all checks pass, so that it can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header is present */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        /* kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact and ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
            else {
                /* legacy string that is not ready: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the widths match */
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing cache must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i = 0;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        while (i < ascii->length) {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar) {
                maxchar = ch;
            }
            i++;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the data must be NUL-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
489
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100490static PyObject*
491unicode_result_wchar(PyObject *unicode)
492{
493#ifndef Py_DEBUG
494 Py_ssize_t len;
495
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100496 len = _PyUnicode_WSTR_LENGTH(unicode);
497 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100498 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200499 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100500 }
501
502 if (len == 1) {
503 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100504 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100505 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
506 Py_DECREF(unicode);
507 return latin1_char;
508 }
509 }
510
511 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200512 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100513 return NULL;
514 }
515#else
Victor Stinneraa771272012-10-04 02:32:58 +0200516 assert(Py_REFCNT(unicode) == 1);
517
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100518 /* don't make the result ready in debug mode to ensure that the caller
519 makes the string ready before using it */
520 assert(_PyUnicode_CheckConsistency(unicode, 1));
521#endif
522 return unicode;
523}
524
525static PyObject*
526unicode_result_ready(PyObject *unicode)
527{
528 Py_ssize_t length;
529
530 length = PyUnicode_GET_LENGTH(unicode);
531 if (length == 0) {
532 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100533 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200534 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100535 }
536 return unicode_empty;
537 }
538
539 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200540 void *data = PyUnicode_DATA(unicode);
541 int kind = PyUnicode_KIND(unicode);
542 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100543 if (ch < 256) {
544 PyObject *latin1_char = unicode_latin1[ch];
545 if (latin1_char != NULL) {
546 if (unicode != latin1_char) {
547 Py_INCREF(latin1_char);
548 Py_DECREF(unicode);
549 }
550 return latin1_char;
551 }
552 else {
553 assert(_PyUnicode_CheckConsistency(unicode, 1));
554 Py_INCREF(unicode);
555 unicode_latin1[ch] = unicode;
556 return unicode;
557 }
558 }
559 }
560
561 assert(_PyUnicode_CheckConsistency(unicode, 1));
562 return unicode;
563}
564
565static PyObject*
566unicode_result(PyObject *unicode)
567{
568 assert(_PyUnicode_CHECK(unicode));
569 if (PyUnicode_IS_READY(unicode))
570 return unicode_result_ready(unicode);
571 else
572 return unicode_result_wchar(unicode);
573}
574
Victor Stinnerc4b49542011-12-11 22:44:26 +0100575static PyObject*
576unicode_result_unchanged(PyObject *unicode)
577{
578 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500579 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100580 return NULL;
581 Py_INCREF(unicode);
582 return unicode;
583 }
584 else
585 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100586 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100587}
588
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200589/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
590 ASCII, Latin1, UTF-8, etc. */
591static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200592backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200593 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
594{
Victor Stinnerad771582015-10-09 12:38:53 +0200595 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200596 Py_UCS4 ch;
597 enum PyUnicode_Kind kind;
598 void *data;
599
600 assert(PyUnicode_IS_READY(unicode));
601 kind = PyUnicode_KIND(unicode);
602 data = PyUnicode_DATA(unicode);
603
604 size = 0;
605 /* determine replacement size */
606 for (i = collstart; i < collend; ++i) {
607 Py_ssize_t incr;
608
609 ch = PyUnicode_READ(kind, data, i);
610 if (ch < 0x100)
611 incr = 2+2;
612 else if (ch < 0x10000)
613 incr = 2+4;
614 else {
615 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200616 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200617 }
618 if (size > PY_SSIZE_T_MAX - incr) {
619 PyErr_SetString(PyExc_OverflowError,
620 "encoded result is too long for a Python string");
621 return NULL;
622 }
623 size += incr;
624 }
625
Victor Stinnerad771582015-10-09 12:38:53 +0200626 str = _PyBytesWriter_Prepare(writer, str, size);
627 if (str == NULL)
628 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200629
630 /* generate replacement */
631 for (i = collstart; i < collend; ++i) {
632 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200633 *str++ = '\\';
634 if (ch >= 0x00010000) {
635 *str++ = 'U';
636 *str++ = Py_hexdigits[(ch>>28)&0xf];
637 *str++ = Py_hexdigits[(ch>>24)&0xf];
638 *str++ = Py_hexdigits[(ch>>20)&0xf];
639 *str++ = Py_hexdigits[(ch>>16)&0xf];
640 *str++ = Py_hexdigits[(ch>>12)&0xf];
641 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200642 }
Victor Stinner797485e2015-10-09 03:17:30 +0200643 else if (ch >= 0x100) {
644 *str++ = 'u';
645 *str++ = Py_hexdigits[(ch>>12)&0xf];
646 *str++ = Py_hexdigits[(ch>>8)&0xf];
647 }
648 else
649 *str++ = 'x';
650 *str++ = Py_hexdigits[(ch>>4)&0xf];
651 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200652 }
653 return str;
654}
655
656/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
657 ASCII, Latin1, UTF-8, etc. */
658static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200659xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200660 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
661{
Victor Stinnerad771582015-10-09 12:38:53 +0200662 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200663 Py_UCS4 ch;
664 enum PyUnicode_Kind kind;
665 void *data;
666
667 assert(PyUnicode_IS_READY(unicode));
668 kind = PyUnicode_KIND(unicode);
669 data = PyUnicode_DATA(unicode);
670
671 size = 0;
672 /* determine replacement size */
673 for (i = collstart; i < collend; ++i) {
674 Py_ssize_t incr;
675
676 ch = PyUnicode_READ(kind, data, i);
677 if (ch < 10)
678 incr = 2+1+1;
679 else if (ch < 100)
680 incr = 2+2+1;
681 else if (ch < 1000)
682 incr = 2+3+1;
683 else if (ch < 10000)
684 incr = 2+4+1;
685 else if (ch < 100000)
686 incr = 2+5+1;
687 else if (ch < 1000000)
688 incr = 2+6+1;
689 else {
690 assert(ch <= MAX_UNICODE);
691 incr = 2+7+1;
692 }
693 if (size > PY_SSIZE_T_MAX - incr) {
694 PyErr_SetString(PyExc_OverflowError,
695 "encoded result is too long for a Python string");
696 return NULL;
697 }
698 size += incr;
699 }
700
Victor Stinnerad771582015-10-09 12:38:53 +0200701 str = _PyBytesWriter_Prepare(writer, str, size);
702 if (str == NULL)
703 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200704
705 /* generate replacement */
706 for (i = collstart; i < collend; ++i) {
707 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
708 }
709 return str;
710}
711
Thomas Wouters477c8d52006-05-27 19:21:47 +0000712/* --- Bloom Filters ----------------------------------------------------- */
713
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
717
718/* the linebreak mask is set up by Unicode_Init below */
719
Antoine Pitrouf068f942010-01-13 14:19:12 +0000720#if LONG_BIT >= 128
721#define BLOOM_WIDTH 128
722#elif LONG_BIT >= 64
723#define BLOOM_WIDTH 64
724#elif LONG_BIT >= 32
725#define BLOOM_WIDTH 32
726#else
727#error "LONG_BIT is smaller than 32"
728#endif
729
Thomas Wouters477c8d52006-05-27 19:21:47 +0000730#define BLOOM_MASK unsigned long
731
Serhiy Storchaka05997252013-01-26 12:14:02 +0200732static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000733
Antoine Pitrouf068f942010-01-13 14:19:12 +0000734#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000735
Benjamin Peterson29060642009-01-31 22:14:21 +0000736#define BLOOM_LINEBREAK(ch) \
737 ((ch) < 128U ? ascii_linebreak[(ch)] : \
738 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000739
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700740static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200741make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000742{
Victor Stinnera85af502013-04-09 21:53:54 +0200743#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
744 do { \
745 TYPE *data = (TYPE *)PTR; \
746 TYPE *end = data + LEN; \
747 Py_UCS4 ch; \
748 for (; data != end; data++) { \
749 ch = *data; \
750 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
751 } \
752 break; \
753 } while (0)
754
Thomas Wouters477c8d52006-05-27 19:21:47 +0000755 /* calculate simple bloom-style bitmask for a given unicode string */
756
Antoine Pitrouf068f942010-01-13 14:19:12 +0000757 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000758
759 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200760 switch (kind) {
761 case PyUnicode_1BYTE_KIND:
762 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
763 break;
764 case PyUnicode_2BYTE_KIND:
765 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
766 break;
767 case PyUnicode_4BYTE_KIND:
768 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
769 break;
770 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700771 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200772 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000773 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200774
775#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000776}
777
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300778static int
779ensure_unicode(PyObject *obj)
780{
781 if (!PyUnicode_Check(obj)) {
782 PyErr_Format(PyExc_TypeError,
783 "must be str, not %.100s",
784 Py_TYPE(obj)->tp_name);
785 return -1;
786 }
787 return PyUnicode_READY(obj);
788}
789
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200790/* Compilation of templated routines */
791
792#include "stringlib/asciilib.h"
793#include "stringlib/fastsearch.h"
794#include "stringlib/partition.h"
795#include "stringlib/split.h"
796#include "stringlib/count.h"
797#include "stringlib/find.h"
798#include "stringlib/find_max_char.h"
799#include "stringlib/localeutil.h"
800#include "stringlib/undef.h"
801
802#include "stringlib/ucs1lib.h"
803#include "stringlib/fastsearch.h"
804#include "stringlib/partition.h"
805#include "stringlib/split.h"
806#include "stringlib/count.h"
807#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300808#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200809#include "stringlib/find_max_char.h"
810#include "stringlib/localeutil.h"
811#include "stringlib/undef.h"
812
813#include "stringlib/ucs2lib.h"
814#include "stringlib/fastsearch.h"
815#include "stringlib/partition.h"
816#include "stringlib/split.h"
817#include "stringlib/count.h"
818#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300819#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200820#include "stringlib/find_max_char.h"
821#include "stringlib/localeutil.h"
822#include "stringlib/undef.h"
823
824#include "stringlib/ucs4lib.h"
825#include "stringlib/fastsearch.h"
826#include "stringlib/partition.h"
827#include "stringlib/split.h"
828#include "stringlib/count.h"
829#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300830#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200831#include "stringlib/find_max_char.h"
832#include "stringlib/localeutil.h"
833#include "stringlib/undef.h"
834
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200835#include "stringlib/unicodedefs.h"
836#include "stringlib/fastsearch.h"
837#include "stringlib/count.h"
838#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100839#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200840
Guido van Rossumd57fd912000-03-10 22:53:23 +0000841/* --- Unicode Object ----------------------------------------------------- */
842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200843static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200844fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200845
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700846static inline Py_ssize_t
847findchar(const void *s, int kind,
848 Py_ssize_t size, Py_UCS4 ch,
849 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200851 switch (kind) {
852 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200853 if ((Py_UCS1) ch != ch)
854 return -1;
855 if (direction > 0)
856 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
857 else
858 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200859 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200860 if ((Py_UCS2) ch != ch)
861 return -1;
862 if (direction > 0)
863 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
864 else
865 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200866 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200867 if (direction > 0)
868 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
869 else
870 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200871 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700872 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874}
875
Victor Stinnerafffce42012-10-03 23:03:17 +0200876#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000877/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200878 earlier.
879
880 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
881 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
882 invalid character in Unicode 6.0. */
883static void
884unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
885{
886 int kind = PyUnicode_KIND(unicode);
887 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
888 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
889 if (length <= old_length)
890 return;
891 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
892}
893#endif
894
Victor Stinnerfe226c02011-10-03 03:52:20 +0200895static PyObject*
896resize_compact(PyObject *unicode, Py_ssize_t length)
897{
898 Py_ssize_t char_size;
899 Py_ssize_t struct_size;
900 Py_ssize_t new_size;
901 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100902 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200903#ifdef Py_DEBUG
904 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
905#endif
906
Victor Stinner79891572012-05-03 13:43:07 +0200907 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200908 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100909 assert(PyUnicode_IS_COMPACT(unicode));
910
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200911 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100912 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200913 struct_size = sizeof(PyASCIIObject);
914 else
915 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200916 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200917
Victor Stinnerfe226c02011-10-03 03:52:20 +0200918 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
919 PyErr_NoMemory();
920 return NULL;
921 }
922 new_size = (struct_size + (length + 1) * char_size);
923
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
925 PyObject_DEL(_PyUnicode_UTF8(unicode));
926 _PyUnicode_UTF8(unicode) = NULL;
927 _PyUnicode_UTF8_LENGTH(unicode) = 0;
928 }
Victor Stinner84def372011-12-11 20:04:56 +0100929 _Py_DEC_REFTOTAL;
930 _Py_ForgetReference(unicode);
931
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300932 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100933 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100934 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200935 PyErr_NoMemory();
936 return NULL;
937 }
Victor Stinner84def372011-12-11 20:04:56 +0100938 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200939 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100940
Victor Stinnerfe226c02011-10-03 03:52:20 +0200941 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200942 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200943 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100944 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200945 _PyUnicode_WSTR_LENGTH(unicode) = length;
946 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100947 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
948 PyObject_DEL(_PyUnicode_WSTR(unicode));
949 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100950 if (!PyUnicode_IS_ASCII(unicode))
951 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100952 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200953#ifdef Py_DEBUG
954 unicode_fill_invalid(unicode, old_length);
955#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200956 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
957 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200958 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200959 return unicode;
960}
961
Alexander Belopolsky40018472011-02-26 01:02:56 +0000962static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200963resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964{
Victor Stinner95663112011-10-04 01:03:50 +0200965 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100966 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000969
Victor Stinnerfe226c02011-10-03 03:52:20 +0200970 if (PyUnicode_IS_READY(unicode)) {
971 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200972 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200974#ifdef Py_DEBUG
975 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
976#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977
978 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200979 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200980 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
981 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200982
983 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
984 PyErr_NoMemory();
985 return -1;
986 }
987 new_size = (length + 1) * char_size;
988
Victor Stinner7a9105a2011-12-12 00:13:42 +0100989 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
990 {
991 PyObject_DEL(_PyUnicode_UTF8(unicode));
992 _PyUnicode_UTF8(unicode) = NULL;
993 _PyUnicode_UTF8_LENGTH(unicode) = 0;
994 }
995
Victor Stinnerfe226c02011-10-03 03:52:20 +0200996 data = (PyObject *)PyObject_REALLOC(data, new_size);
997 if (data == NULL) {
998 PyErr_NoMemory();
999 return -1;
1000 }
1001 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001004 _PyUnicode_WSTR_LENGTH(unicode) = length;
1005 }
1006 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001007 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001008 _PyUnicode_UTF8_LENGTH(unicode) = length;
1009 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001010 _PyUnicode_LENGTH(unicode) = length;
1011 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001012#ifdef Py_DEBUG
1013 unicode_fill_invalid(unicode, old_length);
1014#endif
Victor Stinner95663112011-10-04 01:03:50 +02001015 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001016 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 }
Victor Stinner95663112011-10-04 01:03:50 +02001020 assert(_PyUnicode_WSTR(unicode) != NULL);
1021
1022 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001023 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001024 PyErr_NoMemory();
1025 return -1;
1026 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001027 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001028 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001029 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001030 if (!wstr) {
1031 PyErr_NoMemory();
1032 return -1;
1033 }
1034 _PyUnicode_WSTR(unicode) = wstr;
1035 _PyUnicode_WSTR(unicode)[length] = 0;
1036 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001037 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038 return 0;
1039}
1040
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041static PyObject*
1042resize_copy(PyObject *unicode, Py_ssize_t length)
1043{
1044 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001045 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001047
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001048 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001049
1050 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051 if (copy == NULL)
1052 return NULL;
1053
1054 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001055 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001057 }
1058 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001059 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001060
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001061 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 if (w == NULL)
1063 return NULL;
1064 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1065 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001066 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001067 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001068 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 }
1070}
1071
/* We allocate one extra character (not just one byte) so that the string
   is always U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075
1076 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001077 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078
1079*/
1080
Alexander Belopolsky40018472011-02-26 01:02:56 +00001081static PyUnicodeObject *
1082_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001083{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001084 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001085 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086
Thomas Wouters477c8d52006-05-27 19:21:47 +00001087 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001088 if (length == 0 && unicode_empty != NULL) {
1089 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001090 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 }
1092
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001093 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001094 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001095 return (PyUnicodeObject *)PyErr_NoMemory();
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 if (length < 0) {
1098 PyErr_SetString(PyExc_SystemError,
1099 "Negative size passed to _PyUnicode_New");
1100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 }
1102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001103 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1104 if (unicode == NULL)
1105 return NULL;
1106 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001107
1108 _PyUnicode_WSTR_LENGTH(unicode) = length;
1109 _PyUnicode_HASH(unicode) = -1;
1110 _PyUnicode_STATE(unicode).interned = 0;
1111 _PyUnicode_STATE(unicode).kind = 0;
1112 _PyUnicode_STATE(unicode).compact = 0;
1113 _PyUnicode_STATE(unicode).ready = 0;
1114 _PyUnicode_STATE(unicode).ascii = 0;
1115 _PyUnicode_DATA_ANY(unicode) = NULL;
1116 _PyUnicode_LENGTH(unicode) = 0;
1117 _PyUnicode_UTF8(unicode) = NULL;
1118 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1121 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001122 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001123 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001124 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126
Jeremy Hyltond8082792003-09-16 19:41:39 +00001127 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001128 * the caller fails before initializing str -- unicode_resize()
1129 * reads str[0], and the Keep-Alive optimization can keep memory
1130 * allocated for str alive across a call to unicode_dealloc(unicode).
1131 * We don't want unicode_resize to read uninitialized memory in
1132 * that case.
1133 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_WSTR(unicode)[0] = 0;
1135 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001136
Victor Stinner7931d9a2011-11-04 00:22:48 +01001137 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 return unicode;
1139}
1140
Victor Stinnerf42dc442011-10-02 23:33:16 +02001141static const char*
1142unicode_kind_name(PyObject *unicode)
1143{
Victor Stinner42dfd712011-10-03 14:41:45 +02001144 /* don't check consistency: unicode_kind_name() is called from
1145 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 if (!PyUnicode_IS_COMPACT(unicode))
1147 {
1148 if (!PyUnicode_IS_READY(unicode))
1149 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001150 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
1152 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001153 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001154 return "legacy ascii";
1155 else
1156 return "legacy latin1";
1157 case PyUnicode_2BYTE_KIND:
1158 return "legacy UCS2";
1159 case PyUnicode_4BYTE_KIND:
1160 return "legacy UCS4";
1161 default:
1162 return "<legacy invalid kind>";
1163 }
1164 }
1165 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001166 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001167 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001168 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001169 return "ascii";
1170 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001171 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001173 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001174 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001175 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001176 default:
1177 return "<invalid compact kind>";
1178 }
1179}
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1186
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *chars = _PyUnicode_COMPACT_DATA(unicode);
    return chars;
}
1190void *_PyUnicode_data(void *unicode){
1191 printf("obj %p\n", unicode);
1192 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1193 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1194 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1195 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1196 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1197 return PyUnicode_DATA(unicode);
1198}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001199
1200void
1201_PyUnicode_Dump(PyObject *op)
1202{
1203 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001204 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1205 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1206 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001207
Victor Stinnera849a4b2011-10-03 12:12:11 +02001208 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001209 {
1210 if (ascii->state.ascii)
1211 data = (ascii + 1);
1212 else
1213 data = (compact + 1);
1214 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001215 else
1216 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001217 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1218 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001219
Victor Stinnera849a4b2011-10-03 12:12:11 +02001220 if (ascii->wstr == data)
1221 printf("shared ");
1222 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001223
Victor Stinnera3b334d2011-10-03 13:53:37 +02001224 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001225 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001226 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1227 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001228 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1229 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001230 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001231 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001232}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233#endif
1234
1235PyObject *
1236PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1237{
1238 PyObject *obj;
1239 PyCompactUnicodeObject *unicode;
1240 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001241 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001242 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 Py_ssize_t char_size;
1244 Py_ssize_t struct_size;
1245
1246 /* Optimization for empty strings */
1247 if (size == 0 && unicode_empty != NULL) {
1248 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001249 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 }
1251
Victor Stinner9e9d6892011-10-04 01:02:02 +02001252 is_ascii = 0;
1253 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 struct_size = sizeof(PyCompactUnicodeObject);
1255 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001256 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 char_size = 1;
1258 is_ascii = 1;
1259 struct_size = sizeof(PyASCIIObject);
1260 }
1261 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001262 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001263 char_size = 1;
1264 }
1265 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001266 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267 char_size = 2;
1268 if (sizeof(wchar_t) == 2)
1269 is_sharing = 1;
1270 }
1271 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001272 if (maxchar > MAX_UNICODE) {
1273 PyErr_SetString(PyExc_SystemError,
1274 "invalid maximum character passed to PyUnicode_New");
1275 return NULL;
1276 }
Victor Stinner8f825062012-04-27 13:55:39 +02001277 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 char_size = 4;
1279 if (sizeof(wchar_t) == 4)
1280 is_sharing = 1;
1281 }
1282
1283 /* Ensure we won't overflow the size. */
1284 if (size < 0) {
1285 PyErr_SetString(PyExc_SystemError,
1286 "Negative size passed to PyUnicode_New");
1287 return NULL;
1288 }
1289 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1290 return PyErr_NoMemory();
1291
1292 /* Duplicated allocation code from _PyObject_New() instead of a call to
1293 * PyObject_New() so we are able to allocate space for the object and
1294 * it's data buffer.
1295 */
1296 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1297 if (obj == NULL)
1298 return PyErr_NoMemory();
1299 obj = PyObject_INIT(obj, &PyUnicode_Type);
1300 if (obj == NULL)
1301 return NULL;
1302
1303 unicode = (PyCompactUnicodeObject *)obj;
1304 if (is_ascii)
1305 data = ((PyASCIIObject*)obj) + 1;
1306 else
1307 data = unicode + 1;
1308 _PyUnicode_LENGTH(unicode) = size;
1309 _PyUnicode_HASH(unicode) = -1;
1310 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001311 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 _PyUnicode_STATE(unicode).compact = 1;
1313 _PyUnicode_STATE(unicode).ready = 1;
1314 _PyUnicode_STATE(unicode).ascii = is_ascii;
1315 if (is_ascii) {
1316 ((char*)data)[size] = 0;
1317 _PyUnicode_WSTR(unicode) = NULL;
1318 }
Victor Stinner8f825062012-04-27 13:55:39 +02001319 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 ((char*)data)[size] = 0;
1321 _PyUnicode_WSTR(unicode) = NULL;
1322 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001324 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 else {
1327 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001328 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001329 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001331 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 ((Py_UCS4*)data)[size] = 0;
1333 if (is_sharing) {
1334 _PyUnicode_WSTR_LENGTH(unicode) = size;
1335 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1336 }
1337 else {
1338 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1339 _PyUnicode_WSTR(unicode) = NULL;
1340 }
1341 }
Victor Stinner8f825062012-04-27 13:55:39 +02001342#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001343 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001344#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001345 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 return obj;
1347}
1348
#if SIZEOF_WCHAR_T == 2
/* Decode a 16-bit wchar_t buffer [begin, end) into the UCS4 storage of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into a single code point; every other unit (lone surrogates
   included) is copied through unchanged.

   The caller must have sized `unicode` (a 4-byte kind string) so that the
   decoded characters fill it exactly; this is only verified by assertions.
   No terminating null character is written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        Py_UCS4 ch;

        /* There must still be room for at least one more code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Well-formed pair: two input units, one output code point. */
            ch = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            ch = *src;
            src++;
        }
        *dst++ = ch;
    }

    /* The output must have been filled completely. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1388
Victor Stinnercd9950f2011-10-02 00:34:53 +02001389static int
Victor Stinner488fa492011-12-12 00:01:39 +01001390unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001391{
Victor Stinner488fa492011-12-12 00:01:39 +01001392 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001393 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001394 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001395 return -1;
1396 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001397 return 0;
1398}
1399
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001400static int
1401_copy_characters(PyObject *to, Py_ssize_t to_start,
1402 PyObject *from, Py_ssize_t from_start,
1403 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001405 unsigned int from_kind, to_kind;
1406 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407
Victor Stinneree4544c2012-05-09 22:24:08 +02001408 assert(0 <= how_many);
1409 assert(0 <= from_start);
1410 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001411 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001412 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001413 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414
Victor Stinnerd3f08822012-05-29 12:57:52 +02001415 assert(PyUnicode_Check(to));
1416 assert(PyUnicode_IS_READY(to));
1417 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1418
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001419 if (how_many == 0)
1420 return 0;
1421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001423 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001425 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426
Victor Stinnerf1852262012-06-16 16:38:26 +02001427#ifdef Py_DEBUG
1428 if (!check_maxchar
1429 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1430 {
1431 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1432 Py_UCS4 ch;
1433 Py_ssize_t i;
1434 for (i=0; i < how_many; i++) {
1435 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1436 assert(ch <= to_maxchar);
1437 }
1438 }
1439#endif
1440
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001441 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001442 if (check_maxchar
1443 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1444 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001445 /* Writing Latin-1 characters into an ASCII string requires to
1446 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001447 Py_UCS4 max_char;
1448 max_char = ucs1lib_find_max_char(from_data,
1449 (Py_UCS1*)from_data + how_many);
1450 if (max_char >= 128)
1451 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001452 }
Christian Heimesf051e432016-09-13 20:22:02 +02001453 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001454 (char*)from_data + from_kind * from_start,
1455 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001457 else if (from_kind == PyUnicode_1BYTE_KIND
1458 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001459 {
1460 _PyUnicode_CONVERT_BYTES(
1461 Py_UCS1, Py_UCS2,
1462 PyUnicode_1BYTE_DATA(from) + from_start,
1463 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1464 PyUnicode_2BYTE_DATA(to) + to_start
1465 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001466 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001467 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001468 && to_kind == PyUnicode_4BYTE_KIND)
1469 {
1470 _PyUnicode_CONVERT_BYTES(
1471 Py_UCS1, Py_UCS4,
1472 PyUnicode_1BYTE_DATA(from) + from_start,
1473 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1474 PyUnicode_4BYTE_DATA(to) + to_start
1475 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001476 }
1477 else if (from_kind == PyUnicode_2BYTE_KIND
1478 && to_kind == PyUnicode_4BYTE_KIND)
1479 {
1480 _PyUnicode_CONVERT_BYTES(
1481 Py_UCS2, Py_UCS4,
1482 PyUnicode_2BYTE_DATA(from) + from_start,
1483 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1484 PyUnicode_4BYTE_DATA(to) + to_start
1485 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001486 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001487 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001488 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1489
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001490 if (!check_maxchar) {
1491 if (from_kind == PyUnicode_2BYTE_KIND
1492 && to_kind == PyUnicode_1BYTE_KIND)
1493 {
1494 _PyUnicode_CONVERT_BYTES(
1495 Py_UCS2, Py_UCS1,
1496 PyUnicode_2BYTE_DATA(from) + from_start,
1497 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1498 PyUnicode_1BYTE_DATA(to) + to_start
1499 );
1500 }
1501 else if (from_kind == PyUnicode_4BYTE_KIND
1502 && to_kind == PyUnicode_1BYTE_KIND)
1503 {
1504 _PyUnicode_CONVERT_BYTES(
1505 Py_UCS4, Py_UCS1,
1506 PyUnicode_4BYTE_DATA(from) + from_start,
1507 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1508 PyUnicode_1BYTE_DATA(to) + to_start
1509 );
1510 }
1511 else if (from_kind == PyUnicode_4BYTE_KIND
1512 && to_kind == PyUnicode_2BYTE_KIND)
1513 {
1514 _PyUnicode_CONVERT_BYTES(
1515 Py_UCS4, Py_UCS2,
1516 PyUnicode_4BYTE_DATA(from) + from_start,
1517 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1518 PyUnicode_2BYTE_DATA(to) + to_start
1519 );
1520 }
1521 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001522 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001523 }
1524 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001525 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001526 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001527 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001528 Py_ssize_t i;
1529
Victor Stinnera0702ab2011-09-29 14:14:38 +02001530 for (i=0; i < how_many; i++) {
1531 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001532 if (ch > to_maxchar)
1533 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001534 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1535 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001536 }
1537 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 return 0;
1539}
1540
Victor Stinnerd3f08822012-05-29 12:57:52 +02001541void
1542_PyUnicode_FastCopyCharacters(
1543 PyObject *to, Py_ssize_t to_start,
1544 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001545{
1546 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1547}
1548
1549Py_ssize_t
1550PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1551 PyObject *from, Py_ssize_t from_start,
1552 Py_ssize_t how_many)
1553{
1554 int err;
1555
1556 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1557 PyErr_BadInternalCall();
1558 return -1;
1559 }
1560
Benjamin Petersonbac79492012-01-14 13:34:47 -05001561 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001562 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001563 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 return -1;
1565
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001566 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001567 PyErr_SetString(PyExc_IndexError, "string index out of range");
1568 return -1;
1569 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001570 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001571 PyErr_SetString(PyExc_IndexError, "string index out of range");
1572 return -1;
1573 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001574 if (how_many < 0) {
1575 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1576 return -1;
1577 }
1578 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001579 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1580 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001581 "Cannot write %zi characters at %zi "
1582 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001583 how_many, to_start, PyUnicode_GET_LENGTH(to));
1584 return -1;
1585 }
1586
1587 if (how_many == 0)
1588 return 0;
1589
Victor Stinner488fa492011-12-12 00:01:39 +01001590 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001591 return -1;
1592
1593 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1594 if (err) {
1595 PyErr_Format(PyExc_SystemError,
1596 "Cannot copy %s characters "
1597 "into a string of %s characters",
1598 unicode_kind_name(from),
1599 unicode_kind_name(to));
1600 return -1;
1601 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001602 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603}
1604
Victor Stinner17222162011-09-28 22:15:37 +02001605/* Find the maximum code point and count the number of surrogate pairs so a
1606 correct string length can be computed before converting a string to UCS4.
1607 This function counts single surrogates as a character and not as a pair.
1608
1609 Return 0 on success, or -1 on error. */
1610static int
1611find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1612 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001613{
1614 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001615 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616
Victor Stinnerc53be962011-10-02 21:33:54 +02001617 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618 *num_surrogates = 0;
1619 *maxchar = 0;
1620
1621 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001622#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001623 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1624 && (iter+1) < end
1625 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1626 {
1627 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1628 ++(*num_surrogates);
1629 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001630 }
1631 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001633 {
1634 ch = *iter;
1635 iter++;
1636 }
1637 if (ch > *maxchar) {
1638 *maxchar = ch;
1639 if (*maxchar > MAX_UNICODE) {
1640 PyErr_Format(PyExc_ValueError,
1641 "character U+%x is not in range [U+0000; U+10ffff]",
1642 ch);
1643 return -1;
1644 }
1645 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 }
1647 return 0;
1648}
1649
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001650int
1651_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652{
1653 wchar_t *end;
1654 Py_UCS4 maxchar = 0;
1655 Py_ssize_t num_surrogates;
1656#if SIZEOF_WCHAR_T == 2
1657 Py_ssize_t length_wo_surrogates;
1658#endif
1659
Georg Brandl7597add2011-10-05 16:36:47 +02001660 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001661 strings were created using _PyObject_New() and where no canonical
1662 representation (the str field) has been set yet aka strings
1663 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001664 assert(_PyUnicode_CHECK(unicode));
1665 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001667 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001668 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001669 /* Actually, it should neither be interned nor be anything else: */
1670 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001673 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001674 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676
1677 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001678 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1679 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 PyErr_NoMemory();
1681 return -1;
1682 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001683 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001684 _PyUnicode_WSTR(unicode), end,
1685 PyUnicode_1BYTE_DATA(unicode));
1686 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1687 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1688 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1689 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001690 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001691 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001692 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 }
1694 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001695 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001696 _PyUnicode_UTF8(unicode) = NULL;
1697 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 }
1699 PyObject_FREE(_PyUnicode_WSTR(unicode));
1700 _PyUnicode_WSTR(unicode) = NULL;
1701 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1702 }
1703 /* In this case we might have to convert down from 4-byte native
1704 wchar_t to 2-byte unicode. */
1705 else if (maxchar < 65536) {
1706 assert(num_surrogates == 0 &&
1707 "FindMaxCharAndNumSurrogatePairs() messed up");
1708
Victor Stinner506f5922011-09-28 22:34:18 +02001709#if SIZEOF_WCHAR_T == 2
1710 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001711 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001712 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1713 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1714 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001715 _PyUnicode_UTF8(unicode) = NULL;
1716 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001717#else
1718 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001719 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001720 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001721 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001722 PyErr_NoMemory();
1723 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 }
Victor Stinner506f5922011-09-28 22:34:18 +02001725 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1726 _PyUnicode_WSTR(unicode), end,
1727 PyUnicode_2BYTE_DATA(unicode));
1728 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1729 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1730 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001731 _PyUnicode_UTF8(unicode) = NULL;
1732 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001733 PyObject_FREE(_PyUnicode_WSTR(unicode));
1734 _PyUnicode_WSTR(unicode) = NULL;
1735 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1736#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737 }
1738 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1739 else {
1740#if SIZEOF_WCHAR_T == 2
1741 /* in case the native representation is 2-bytes, we need to allocate a
1742 new normalized 4-byte version. */
1743 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001744 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1745 PyErr_NoMemory();
1746 return -1;
1747 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001748 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1749 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 PyErr_NoMemory();
1751 return -1;
1752 }
1753 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1754 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001755 _PyUnicode_UTF8(unicode) = NULL;
1756 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001757 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1758 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001759 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 PyObject_FREE(_PyUnicode_WSTR(unicode));
1761 _PyUnicode_WSTR(unicode) = NULL;
1762 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1763#else
1764 assert(num_surrogates == 0);
1765
Victor Stinnerc3c74152011-10-02 20:39:55 +02001766 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001768 _PyUnicode_UTF8(unicode) = NULL;
1769 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1771#endif
1772 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1773 }
1774 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001775 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 return 0;
1777}
1778
Alexander Belopolsky40018472011-02-26 01:02:56 +00001779static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001780unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001781{
Walter Dörwald16807132007-05-25 13:52:07 +00001782 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001783 case SSTATE_NOT_INTERNED:
1784 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001785
Benjamin Peterson29060642009-01-31 22:14:21 +00001786 case SSTATE_INTERNED_MORTAL:
1787 /* revive dead object temporarily for DelItem */
1788 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001789 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001790 Py_FatalError(
1791 "deletion of interned string failed");
1792 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001793
Benjamin Peterson29060642009-01-31 22:14:21 +00001794 case SSTATE_INTERNED_IMMORTAL:
1795 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001796 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001797
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 default:
1799 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001800 }
1801
Victor Stinner03490912011-10-03 23:45:12 +02001802 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001804 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001805 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001806 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1807 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001809 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810}
1811
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical representation can be
       an entry of the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1828
Alexander Belopolsky40018472011-02-26 01:02:56 +00001829static int
Victor Stinner488fa492011-12-12 00:01:39 +01001830unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001831{
Victor Stinner488fa492011-12-12 00:01:39 +01001832 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001833 if (Py_REFCNT(unicode) != 1)
1834 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001835 if (_PyUnicode_HASH(unicode) != -1)
1836 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001837 if (PyUnicode_CHECK_INTERNED(unicode))
1838 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001839 if (!PyUnicode_CheckExact(unicode))
1840 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001841#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001842 /* singleton refcount is greater than 1 */
1843 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001844#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001845 return 1;
1846}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001847
Victor Stinnerfe226c02011-10-03 03:52:20 +02001848static int
1849unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1850{
1851 PyObject *unicode;
1852 Py_ssize_t old_length;
1853
1854 assert(p_unicode != NULL);
1855 unicode = *p_unicode;
1856
1857 assert(unicode != NULL);
1858 assert(PyUnicode_Check(unicode));
1859 assert(0 <= length);
1860
Victor Stinner910337b2011-10-03 03:20:16 +02001861 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001862 old_length = PyUnicode_WSTR_LENGTH(unicode);
1863 else
1864 old_length = PyUnicode_GET_LENGTH(unicode);
1865 if (old_length == length)
1866 return 0;
1867
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001868 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001869 _Py_INCREF_UNICODE_EMPTY();
1870 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001871 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001872 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001873 return 0;
1874 }
1875
Victor Stinner488fa492011-12-12 00:01:39 +01001876 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001877 PyObject *copy = resize_copy(unicode, length);
1878 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001879 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001880 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001881 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001882 }
1883
Victor Stinnerfe226c02011-10-03 03:52:20 +02001884 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001885 PyObject *new_unicode = resize_compact(unicode, length);
1886 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001887 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001888 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001889 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001890 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001891 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001892}
1893
Alexander Belopolsky40018472011-02-26 01:02:56 +00001894int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001895PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001896{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001897 PyObject *unicode;
1898 if (p_unicode == NULL) {
1899 PyErr_BadInternalCall();
1900 return -1;
1901 }
1902 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001903 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001904 {
1905 PyErr_BadInternalCall();
1906 return -1;
1907 }
1908 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001909}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001911/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001912
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001913 WARNING: The function doesn't copy the terminating null character and
1914 doesn't check the maximum character (may write a latin1 character in an
1915 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001916static void
1917unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1918 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001919{
1920 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1921 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001922 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001923
1924 switch (kind) {
1925 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001926 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001927#ifdef Py_DEBUG
1928 if (PyUnicode_IS_ASCII(unicode)) {
1929 Py_UCS4 maxchar = ucs1lib_find_max_char(
1930 (const Py_UCS1*)str,
1931 (const Py_UCS1*)str + len);
1932 assert(maxchar < 128);
1933 }
1934#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001935 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001936 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001937 }
1938 case PyUnicode_2BYTE_KIND: {
1939 Py_UCS2 *start = (Py_UCS2 *)data + index;
1940 Py_UCS2 *ucs2 = start;
1941 assert(index <= PyUnicode_GET_LENGTH(unicode));
1942
Victor Stinner184252a2012-06-16 02:57:41 +02001943 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001944 *ucs2 = (Py_UCS2)*str;
1945
1946 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001947 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001948 }
1949 default: {
1950 Py_UCS4 *start = (Py_UCS4 *)data + index;
1951 Py_UCS4 *ucs4 = start;
1952 assert(kind == PyUnicode_4BYTE_KIND);
1953 assert(index <= PyUnicode_GET_LENGTH(unicode));
1954
Victor Stinner184252a2012-06-16 02:57:41 +02001955 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001956 *ucs4 = (Py_UCS4)*str;
1957
1958 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001959 }
1960 }
1961}
1962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963static PyObject*
1964get_latin1_char(unsigned char ch)
1965{
Victor Stinnera464fc12011-10-02 20:39:30 +02001966 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001968 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 if (!unicode)
1970 return NULL;
1971 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001972 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 unicode_latin1[ch] = unicode;
1974 }
1975 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001976 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977}
1978
Victor Stinner985a82a2014-01-03 12:53:47 +01001979static PyObject*
1980unicode_char(Py_UCS4 ch)
1981{
1982 PyObject *unicode;
1983
1984 assert(ch <= MAX_UNICODE);
1985
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001986 if (ch < 256)
1987 return get_latin1_char(ch);
1988
Victor Stinner985a82a2014-01-03 12:53:47 +01001989 unicode = PyUnicode_New(1, ch);
1990 if (unicode == NULL)
1991 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001992
1993 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1994 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01001995 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001996 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1998 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1999 }
2000 assert(_PyUnicode_CheckConsistency(unicode, 1));
2001 return unicode;
2002}
2003
Alexander Belopolsky40018472011-02-26 01:02:56 +00002004PyObject *
2005PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002007 if (u == NULL)
2008 return (PyObject*)_PyUnicode_New(size);
2009
2010 if (size < 0) {
2011 PyErr_BadInternalCall();
2012 return NULL;
2013 }
2014
2015 return PyUnicode_FromWideChar(u, size);
2016}
2017
2018PyObject *
2019PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2020{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002021 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 Py_UCS4 maxchar = 0;
2023 Py_ssize_t num_surrogates;
2024
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002025 if (u == NULL && size != 0) {
2026 PyErr_BadInternalCall();
2027 return NULL;
2028 }
2029
2030 if (size == -1) {
2031 size = wcslen(u);
2032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002034 /* If the Unicode data is known at construction time, we can apply
2035 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002038 if (size == 0)
2039 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 /* Single character Unicode objects in the Latin-1 range are
2042 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002043 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 return get_latin1_char((unsigned char)*u);
2045
2046 /* If not empty and not single character, copy the Unicode data
2047 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002048 if (find_maxchar_surrogates(u, u + size,
2049 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 return NULL;
2051
Victor Stinner8faf8212011-12-08 22:14:11 +01002052 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 if (!unicode)
2054 return NULL;
2055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 switch (PyUnicode_KIND(unicode)) {
2057 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002058 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2060 break;
2061 case PyUnicode_2BYTE_KIND:
2062#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002063 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002065 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2067#endif
2068 break;
2069 case PyUnicode_4BYTE_KIND:
2070#if SIZEOF_WCHAR_T == 2
2071 /* This is the only case which has to process surrogates, thus
2072 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002073 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074#else
2075 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002076 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077#endif
2078 break;
2079 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002080 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002083 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084}
2085
Alexander Belopolsky40018472011-02-26 01:02:56 +00002086PyObject *
2087PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002088{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002089 if (size < 0) {
2090 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002091 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002092 return NULL;
2093 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002094 if (u != NULL)
2095 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2096 else
2097 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002098}
2099
Alexander Belopolsky40018472011-02-26 01:02:56 +00002100PyObject *
2101PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002102{
2103 size_t size = strlen(u);
2104 if (size > PY_SSIZE_T_MAX) {
2105 PyErr_SetString(PyExc_OverflowError, "input too long");
2106 return NULL;
2107 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002108 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002109}
2110
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002111PyObject *
2112_PyUnicode_FromId(_Py_Identifier *id)
2113{
2114 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002115 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2116 strlen(id->string),
2117 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002118 if (!id->object)
2119 return NULL;
2120 PyUnicode_InternInPlace(&id->object);
2121 assert(!id->next);
2122 id->next = static_strings;
2123 static_strings = id;
2124 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002125 return id->object;
2126}
2127
2128void
2129_PyUnicode_ClearStaticStrings()
2130{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002131 _Py_Identifier *tmp, *s = static_strings;
2132 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002133 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002134 tmp = s->next;
2135 s->next = NULL;
2136 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002137 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002138 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002139}
2140
Benjamin Peterson0df54292012-03-26 14:50:32 -04002141/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002142
Victor Stinnerd3f08822012-05-29 12:57:52 +02002143PyObject*
2144_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002145{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002146 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002147 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002148 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002149#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002150 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002151#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002152 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002153 }
Victor Stinner785938e2011-12-11 20:09:03 +01002154 unicode = PyUnicode_New(size, 127);
2155 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002156 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002157 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2158 assert(_PyUnicode_CheckConsistency(unicode, 1));
2159 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002160}
2161
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002162static Py_UCS4
2163kind_maxchar_limit(unsigned int kind)
2164{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002165 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002166 case PyUnicode_1BYTE_KIND:
2167 return 0x80;
2168 case PyUnicode_2BYTE_KIND:
2169 return 0x100;
2170 case PyUnicode_4BYTE_KIND:
2171 return 0x10000;
2172 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002173 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002174 }
2175}
2176
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -07002177static inline Py_UCS4
Victor Stinnere6abb482012-05-02 01:15:40 +02002178align_maxchar(Py_UCS4 maxchar)
2179{
2180 if (maxchar <= 127)
2181 return 127;
2182 else if (maxchar <= 255)
2183 return 255;
2184 else if (maxchar <= 65535)
2185 return 65535;
2186 else
2187 return MAX_UNICODE;
2188}
2189
Victor Stinner702c7342011-10-05 13:50:52 +02002190static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002191_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002194 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002195
Serhiy Storchaka678db842013-01-26 12:16:36 +02002196 if (size == 0)
2197 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002198 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002199 if (size == 1)
2200 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002201
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002202 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002203 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 if (!res)
2205 return NULL;
2206 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002207 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002209}
2210
Victor Stinnere57b1c02011-09-28 22:20:48 +02002211static PyObject*
2212_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213{
2214 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002215 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002216
Serhiy Storchaka678db842013-01-26 12:16:36 +02002217 if (size == 0)
2218 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002219 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002220 if (size == 1)
2221 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002222
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002223 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002224 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 if (!res)
2226 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002227 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002229 else {
2230 _PyUnicode_CONVERT_BYTES(
2231 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2232 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002233 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 return res;
2235}
2236
Victor Stinnere57b1c02011-09-28 22:20:48 +02002237static PyObject*
2238_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239{
2240 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002241 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002242
Serhiy Storchaka678db842013-01-26 12:16:36 +02002243 if (size == 0)
2244 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002245 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002246 if (size == 1)
2247 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002248
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002249 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002250 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 if (!res)
2252 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002253 if (max_char < 256)
2254 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2255 PyUnicode_1BYTE_DATA(res));
2256 else if (max_char < 0x10000)
2257 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2258 PyUnicode_2BYTE_DATA(res));
2259 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002261 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 return res;
2263}
2264
2265PyObject*
2266PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2267{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002268 if (size < 0) {
2269 PyErr_SetString(PyExc_ValueError, "size must be positive");
2270 return NULL;
2271 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002272 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002274 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002276 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002278 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002279 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002280 PyErr_SetString(PyExc_SystemError, "invalid kind");
2281 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283}
2284
Victor Stinnerece58de2012-04-23 23:36:38 +02002285Py_UCS4
2286_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2287{
2288 enum PyUnicode_Kind kind;
2289 void *startptr, *endptr;
2290
2291 assert(PyUnicode_IS_READY(unicode));
2292 assert(0 <= start);
2293 assert(end <= PyUnicode_GET_LENGTH(unicode));
2294 assert(start <= end);
2295
2296 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2297 return PyUnicode_MAX_CHAR_VALUE(unicode);
2298
2299 if (start == end)
2300 return 127;
2301
Victor Stinner94d558b2012-04-27 22:26:58 +02002302 if (PyUnicode_IS_ASCII(unicode))
2303 return 127;
2304
Victor Stinnerece58de2012-04-23 23:36:38 +02002305 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002306 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002307 endptr = (char *)startptr + end * kind;
2308 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002309 switch(kind) {
2310 case PyUnicode_1BYTE_KIND:
2311 return ucs1lib_find_max_char(startptr, endptr);
2312 case PyUnicode_2BYTE_KIND:
2313 return ucs2lib_find_max_char(startptr, endptr);
2314 case PyUnicode_4BYTE_KIND:
2315 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002316 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002317 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002318 }
2319}
2320
Victor Stinner25a4b292011-10-06 12:31:55 +02002321/* Ensure that a string uses the most efficient storage, if it is not the
2322 case: create a new string with of the right kind. Write NULL into *p_unicode
2323 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002324static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002325unicode_adjust_maxchar(PyObject **p_unicode)
2326{
2327 PyObject *unicode, *copy;
2328 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002329 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002330 unsigned int kind;
2331
2332 assert(p_unicode != NULL);
2333 unicode = *p_unicode;
2334 assert(PyUnicode_IS_READY(unicode));
2335 if (PyUnicode_IS_ASCII(unicode))
2336 return;
2337
2338 len = PyUnicode_GET_LENGTH(unicode);
2339 kind = PyUnicode_KIND(unicode);
2340 if (kind == PyUnicode_1BYTE_KIND) {
2341 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002342 max_char = ucs1lib_find_max_char(u, u + len);
2343 if (max_char >= 128)
2344 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002345 }
2346 else if (kind == PyUnicode_2BYTE_KIND) {
2347 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002348 max_char = ucs2lib_find_max_char(u, u + len);
2349 if (max_char >= 256)
2350 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002351 }
2352 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002353 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002354 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002355 max_char = ucs4lib_find_max_char(u, u + len);
2356 if (max_char >= 0x10000)
2357 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002358 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002359 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002360 if (copy != NULL)
2361 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002362 Py_DECREF(unicode);
2363 *p_unicode = copy;
2364}
2365
Victor Stinner034f6cf2011-09-30 02:26:44 +02002366PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002367_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002368{
Victor Stinner87af4f22011-11-21 23:03:47 +01002369 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002370 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002371
Victor Stinner034f6cf2011-09-30 02:26:44 +02002372 if (!PyUnicode_Check(unicode)) {
2373 PyErr_BadInternalCall();
2374 return NULL;
2375 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002376 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002377 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002378
Victor Stinner87af4f22011-11-21 23:03:47 +01002379 length = PyUnicode_GET_LENGTH(unicode);
2380 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002381 if (!copy)
2382 return NULL;
2383 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2384
Christian Heimesf051e432016-09-13 20:22:02 +02002385 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002386 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002387 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002388 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002389}
2390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391
Victor Stinnerbc603d12011-10-02 01:00:40 +02002392/* Widen Unicode objects to larger buffers. Don't write terminating null
2393 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394
2395void*
2396_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2397{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002398 Py_ssize_t len;
2399 void *result;
2400 unsigned int skind;
2401
Benjamin Petersonbac79492012-01-14 13:34:47 -05002402 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002403 return NULL;
2404
2405 len = PyUnicode_GET_LENGTH(s);
2406 skind = PyUnicode_KIND(s);
2407 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002408 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 return NULL;
2410 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002411 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002412 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002413 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002414 if (!result)
2415 return PyErr_NoMemory();
2416 assert(skind == PyUnicode_1BYTE_KIND);
2417 _PyUnicode_CONVERT_BYTES(
2418 Py_UCS1, Py_UCS2,
2419 PyUnicode_1BYTE_DATA(s),
2420 PyUnicode_1BYTE_DATA(s) + len,
2421 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002423 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002424 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002425 if (!result)
2426 return PyErr_NoMemory();
2427 if (skind == PyUnicode_2BYTE_KIND) {
2428 _PyUnicode_CONVERT_BYTES(
2429 Py_UCS2, Py_UCS4,
2430 PyUnicode_2BYTE_DATA(s),
2431 PyUnicode_2BYTE_DATA(s) + len,
2432 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002434 else {
2435 assert(skind == PyUnicode_1BYTE_KIND);
2436 _PyUnicode_CONVERT_BYTES(
2437 Py_UCS1, Py_UCS4,
2438 PyUnicode_1BYTE_DATA(s),
2439 PyUnicode_1BYTE_DATA(s) + len,
2440 result);
2441 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002442 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002443 default:
2444 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 }
Victor Stinner01698042011-10-04 00:04:26 +02002446 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 return NULL;
2448}
2449
2450static Py_UCS4*
2451as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2452 int copy_null)
2453{
2454 int kind;
2455 void *data;
2456 Py_ssize_t len, targetlen;
2457 if (PyUnicode_READY(string) == -1)
2458 return NULL;
2459 kind = PyUnicode_KIND(string);
2460 data = PyUnicode_DATA(string);
2461 len = PyUnicode_GET_LENGTH(string);
2462 targetlen = len;
2463 if (copy_null)
2464 targetlen++;
2465 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002466 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 if (!target) {
2468 PyErr_NoMemory();
2469 return NULL;
2470 }
2471 }
2472 else {
2473 if (targetsize < targetlen) {
2474 PyErr_Format(PyExc_SystemError,
2475 "string is longer than the buffer");
2476 if (copy_null && 0 < targetsize)
2477 target[0] = 0;
2478 return NULL;
2479 }
2480 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002481 if (kind == PyUnicode_1BYTE_KIND) {
2482 Py_UCS1 *start = (Py_UCS1 *) data;
2483 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002484 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002485 else if (kind == PyUnicode_2BYTE_KIND) {
2486 Py_UCS2 *start = (Py_UCS2 *) data;
2487 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2488 }
2489 else {
2490 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002491 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002492 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 if (copy_null)
2494 target[len] = 0;
2495 return target;
2496}
2497
2498Py_UCS4*
2499PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2500 int copy_null)
2501{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002502 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 PyErr_BadInternalCall();
2504 return NULL;
2505 }
2506 return as_ucs4(string, target, targetsize, copy_null);
2507}
2508
2509Py_UCS4*
2510PyUnicode_AsUCS4Copy(PyObject *string)
2511{
2512 return as_ucs4(string, NULL, 0, 1);
2513}
2514
/* Size of the char buffers used below to format a number with sprintf()
   (%lld and friends, or %p).
   A long long needs at most ceil(log10(256)*SIZEOF_LONG_LONG) decimal
   digits; 53/22 is an upper bound for log10(256), and the "-1" makes the
   integer division round up.  The leading 2 adds 1 for the sign and
   NOTE(review): presumably 1 for the terminating null character written
   by sprintf() -- confirm. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002519
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002520static int
2521unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2522 Py_ssize_t width, Py_ssize_t precision)
2523{
2524 Py_ssize_t length, fill, arglen;
2525 Py_UCS4 maxchar;
2526
2527 if (PyUnicode_READY(str) == -1)
2528 return -1;
2529
2530 length = PyUnicode_GET_LENGTH(str);
2531 if ((precision == -1 || precision >= length)
2532 && width <= length)
2533 return _PyUnicodeWriter_WriteStr(writer, str);
2534
2535 if (precision != -1)
2536 length = Py_MIN(precision, length);
2537
2538 arglen = Py_MAX(length, width);
2539 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2540 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2541 else
2542 maxchar = writer->maxchar;
2543
2544 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2545 return -1;
2546
2547 if (width > length) {
2548 fill = width - length;
2549 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2550 return -1;
2551 writer->pos += fill;
2552 }
2553
2554 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2555 str, 0, length);
2556 writer->pos += length;
2557 return 0;
2558}
2559
2560static int
2561unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2562 Py_ssize_t width, Py_ssize_t precision)
2563{
2564 /* UTF-8 */
2565 Py_ssize_t length;
2566 PyObject *unicode;
2567 int res;
2568
2569 length = strlen(str);
2570 if (precision != -1)
2571 length = Py_MIN(length, precision);
2572 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2573 if (unicode == NULL)
2574 return -1;
2575
2576 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2577 Py_DECREF(unicode);
2578 return res;
2579}
2580
Victor Stinner96865452011-03-01 23:44:09 +00002581static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002582unicode_fromformat_arg(_PyUnicodeWriter *writer,
2583 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002584{
Victor Stinnere215d962012-10-06 23:03:36 +02002585 const char *p;
2586 Py_ssize_t len;
2587 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002588 Py_ssize_t width;
2589 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002590 int longflag;
2591 int longlongflag;
2592 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002593 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002594
2595 p = f;
2596 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002597 zeropad = 0;
2598 if (*f == '0') {
2599 zeropad = 1;
2600 f++;
2601 }
Victor Stinner96865452011-03-01 23:44:09 +00002602
2603 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002604 width = -1;
2605 if (Py_ISDIGIT((unsigned)*f)) {
2606 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002607 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002608 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002609 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002610 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002612 return NULL;
2613 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002614 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002615 f++;
2616 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002617 }
2618 precision = -1;
2619 if (*f == '.') {
2620 f++;
2621 if (Py_ISDIGIT((unsigned)*f)) {
2622 precision = (*f - '0');
2623 f++;
2624 while (Py_ISDIGIT((unsigned)*f)) {
2625 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2626 PyErr_SetString(PyExc_ValueError,
2627 "precision too big");
2628 return NULL;
2629 }
2630 precision = (precision * 10) + (*f - '0');
2631 f++;
2632 }
2633 }
Victor Stinner96865452011-03-01 23:44:09 +00002634 if (*f == '%') {
2635 /* "%.3%s" => f points to "3" */
2636 f--;
2637 }
2638 }
2639 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002640 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002641 f--;
2642 }
Victor Stinner96865452011-03-01 23:44:09 +00002643
2644 /* Handle %ld, %lu, %lld and %llu. */
2645 longflag = 0;
2646 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002647 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002648 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002649 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002650 longflag = 1;
2651 ++f;
2652 }
Victor Stinner96865452011-03-01 23:44:09 +00002653 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002654 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002655 longlongflag = 1;
2656 f += 2;
2657 }
Victor Stinner96865452011-03-01 23:44:09 +00002658 }
2659 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002660 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002661 size_tflag = 1;
2662 ++f;
2663 }
Victor Stinnere215d962012-10-06 23:03:36 +02002664
2665 if (f[1] == '\0')
2666 writer->overallocate = 0;
2667
2668 switch (*f) {
2669 case 'c':
2670 {
2671 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002672 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002673 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002674 "character argument not in range(0x110000)");
2675 return NULL;
2676 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002677 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002678 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002679 break;
2680 }
2681
2682 case 'i':
2683 case 'd':
2684 case 'u':
2685 case 'x':
2686 {
2687 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002688 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002689 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002690
2691 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002692 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002693 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002694 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002695 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002696 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002697 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002698 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002699 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002700 va_arg(*vargs, size_t));
2701 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002702 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002703 va_arg(*vargs, unsigned int));
2704 }
2705 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002706 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002707 }
2708 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002709 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002710 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002711 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002712 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002713 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002714 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002715 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002716 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002717 va_arg(*vargs, Py_ssize_t));
2718 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002719 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002720 va_arg(*vargs, int));
2721 }
2722 assert(len >= 0);
2723
Victor Stinnere215d962012-10-06 23:03:36 +02002724 if (precision < len)
2725 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002726
2727 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002728 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2729 return NULL;
2730
Victor Stinnere215d962012-10-06 23:03:36 +02002731 if (width > precision) {
2732 Py_UCS4 fillchar;
2733 fill = width - precision;
2734 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002735 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2736 return NULL;
2737 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002738 }
Victor Stinner15a11362012-10-06 23:48:20 +02002739 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002740 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002741 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2742 return NULL;
2743 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002744 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002745
Victor Stinner4a587072013-11-19 12:54:53 +01002746 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2747 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002748 break;
2749 }
2750
2751 case 'p':
2752 {
2753 char number[MAX_LONG_LONG_CHARS];
2754
2755 len = sprintf(number, "%p", va_arg(*vargs, void*));
2756 assert(len >= 0);
2757
2758 /* %p is ill-defined: ensure leading 0x. */
2759 if (number[1] == 'X')
2760 number[1] = 'x';
2761 else if (number[1] != 'x') {
2762 memmove(number + 2, number,
2763 strlen(number) + 1);
2764 number[0] = '0';
2765 number[1] = 'x';
2766 len += 2;
2767 }
2768
Victor Stinner4a587072013-11-19 12:54:53 +01002769 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return NULL;
2771 break;
2772 }
2773
2774 case 's':
2775 {
2776 /* UTF-8 */
2777 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002778 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002779 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002780 break;
2781 }
2782
2783 case 'U':
2784 {
2785 PyObject *obj = va_arg(*vargs, PyObject *);
2786 assert(obj && _PyUnicode_CHECK(obj));
2787
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002788 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002789 return NULL;
2790 break;
2791 }
2792
2793 case 'V':
2794 {
2795 PyObject *obj = va_arg(*vargs, PyObject *);
2796 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002797 if (obj) {
2798 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002799 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002800 return NULL;
2801 }
2802 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002803 assert(str != NULL);
2804 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002805 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002806 }
2807 break;
2808 }
2809
2810 case 'S':
2811 {
2812 PyObject *obj = va_arg(*vargs, PyObject *);
2813 PyObject *str;
2814 assert(obj);
2815 str = PyObject_Str(obj);
2816 if (!str)
2817 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002818 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002819 Py_DECREF(str);
2820 return NULL;
2821 }
2822 Py_DECREF(str);
2823 break;
2824 }
2825
2826 case 'R':
2827 {
2828 PyObject *obj = va_arg(*vargs, PyObject *);
2829 PyObject *repr;
2830 assert(obj);
2831 repr = PyObject_Repr(obj);
2832 if (!repr)
2833 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002834 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002835 Py_DECREF(repr);
2836 return NULL;
2837 }
2838 Py_DECREF(repr);
2839 break;
2840 }
2841
2842 case 'A':
2843 {
2844 PyObject *obj = va_arg(*vargs, PyObject *);
2845 PyObject *ascii;
2846 assert(obj);
2847 ascii = PyObject_ASCII(obj);
2848 if (!ascii)
2849 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002850 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002851 Py_DECREF(ascii);
2852 return NULL;
2853 }
2854 Py_DECREF(ascii);
2855 break;
2856 }
2857
2858 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002859 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002860 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002861 break;
2862
2863 default:
2864 /* if we stumble upon an unknown formatting code, copy the rest
2865 of the format string to the output string. (we cannot just
2866 skip the code, since there's no way to know what's in the
2867 argument list) */
2868 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002869 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002870 return NULL;
2871 f = p+len;
2872 return f;
2873 }
2874
2875 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002876 return f;
2877}
2878
Walter Dörwaldd2034312007-05-18 16:29:38 +00002879PyObject *
2880PyUnicode_FromFormatV(const char *format, va_list vargs)
2881{
Victor Stinnere215d962012-10-06 23:03:36 +02002882 va_list vargs2;
2883 const char *f;
2884 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002885
Victor Stinner8f674cc2013-04-17 23:02:17 +02002886 _PyUnicodeWriter_Init(&writer);
2887 writer.min_length = strlen(format) + 100;
2888 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002889
Benjamin Peterson0c212142016-09-20 20:39:33 -07002890 // Copy varags to be able to pass a reference to a subfunction.
2891 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002892
2893 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002894 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002895 f = unicode_fromformat_arg(&writer, f, &vargs2);
2896 if (f == NULL)
2897 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002898 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002899 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002900 const char *p;
2901 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002902
Victor Stinnere215d962012-10-06 23:03:36 +02002903 p = f;
2904 do
2905 {
2906 if ((unsigned char)*p > 127) {
2907 PyErr_Format(PyExc_ValueError,
2908 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2909 "string, got a non-ASCII byte: 0x%02x",
2910 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002911 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002912 }
2913 p++;
2914 }
2915 while (*p != '\0' && *p != '%');
2916 len = p - f;
2917
2918 if (*p == '\0')
2919 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002920
2921 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002922 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002923
2924 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002925 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002926 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002927 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002928 return _PyUnicodeWriter_Finish(&writer);
2929
2930 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002931 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002932 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002933 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002934}
2935
Walter Dörwaldd2034312007-05-18 16:29:38 +00002936PyObject *
2937PyUnicode_FromFormat(const char *format, ...)
2938{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002939 PyObject* ret;
2940 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941
2942#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002943 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002944#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002945 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002946#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002947 ret = PyUnicode_FromFormatV(format, vargs);
2948 va_end(vargs);
2949 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002950}
2951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002952#ifdef HAVE_WCHAR_H
2953
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002954/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00002955
Victor Stinnerd88d9832011-09-06 02:00:05 +02002956 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002957 character) required to convert the unicode object. Ignore size argument.
2958
Victor Stinnerd88d9832011-09-06 02:00:05 +02002959 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002961 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002962Py_ssize_t
2963PyUnicode_AsWideChar(PyObject *unicode,
2964 wchar_t *w,
2965 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00002966{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002967 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002968 const wchar_t *wstr;
2969
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002970 if (unicode == NULL) {
2971 PyErr_BadInternalCall();
2972 return -1;
2973 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002974 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002975 if (wstr == NULL)
2976 return -1;
2977
Victor Stinner5593d8a2010-10-02 11:11:27 +00002978 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002979 if (size > res)
2980 size = res + 1;
2981 else
2982 res = size;
Christian Heimesf051e432016-09-13 20:22:02 +02002983 memcpy(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002984 return res;
2985 }
2986 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002987 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002988}
2989
Victor Stinner137c34c2010-09-29 10:25:54 +00002990wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002991PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002992 Py_ssize_t *size)
2993{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03002994 const wchar_t *wstr;
2995 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00002996 Py_ssize_t buflen;
2997
2998 if (unicode == NULL) {
2999 PyErr_BadInternalCall();
3000 return NULL;
3001 }
3002
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003003 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3004 if (wstr == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003005 return NULL;
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003006 }
3007 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3008 PyErr_SetString(PyExc_ValueError,
3009 "embedded null character");
3010 return NULL;
3011 }
3012
3013 buffer = PyMem_NEW(wchar_t, buflen + 1);
Victor Stinner137c34c2010-09-29 10:25:54 +00003014 if (buffer == NULL) {
3015 PyErr_NoMemory();
3016 return NULL;
3017 }
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003018 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00003019 if (size != NULL)
3020 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00003021 return buffer;
3022}
3023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003024#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025
Alexander Belopolsky40018472011-02-26 01:02:56 +00003026PyObject *
3027PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003028{
Victor Stinner8faf8212011-12-08 22:14:11 +01003029 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 PyErr_SetString(PyExc_ValueError,
3031 "chr() arg not in range(0x110000)");
3032 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003033 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003034
Victor Stinner985a82a2014-01-03 12:53:47 +01003035 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003036}
3037
Alexander Belopolsky40018472011-02-26 01:02:56 +00003038PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003039PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003041 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003043 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003044 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003045 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003046 Py_INCREF(obj);
3047 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003048 }
3049 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003050 /* For a Unicode subtype that's not a Unicode object,
3051 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003052 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003053 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003054 PyErr_Format(PyExc_TypeError,
3055 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003056 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003057 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003058}
3059
Alexander Belopolsky40018472011-02-26 01:02:56 +00003060PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003061PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003062 const char *encoding,
3063 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003064{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003065 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003066 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003067
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 PyErr_BadInternalCall();
3070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003072
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003073 /* Decoding bytes objects is the most common case and should be fast */
3074 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003075 if (PyBytes_GET_SIZE(obj) == 0)
3076 _Py_RETURN_UNICODE_EMPTY();
3077 v = PyUnicode_Decode(
3078 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3079 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003080 return v;
3081 }
3082
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003083 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 PyErr_SetString(PyExc_TypeError,
3085 "decoding str is not supported");
3086 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003087 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003088
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003089 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3090 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3091 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003092 "decoding to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003093 Py_TYPE(obj)->tp_name);
3094 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003095 }
Tim Petersced69f82003-09-16 20:30:58 +00003096
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003097 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003098 PyBuffer_Release(&buffer);
3099 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003101
Serhiy Storchaka05997252013-01-26 12:14:02 +02003102 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003103 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003104 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105}
3106
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Every
   run of other characters located between two copied characters is replaced
   by a single '_'; such runs at the start or at the end of the name are
   dropped.  The result is always null-terminated on success.

   `lower_len` is the capacity of `lower` in bytes, terminator included.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters; this includes lower_len == 0, where not even the
   terminator fits and nothing is written). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct = 0;

    assert(encoding != NULL);

    /* Without this guard lower_len - 1 would wrap around and the final
       terminator would be written outside of the buffer. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    /* Last usable slot: reserved for the terminating null character. */
    l_end = &lower[lower_len - 1];

    for (e = encoding; *e != '\0'; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if another
               copied character follows. */
            punct = 1;
            continue;
        }

        /* Emit one '_' for the pending separator run, except at the very
           start of the output. */
        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }

    *l = '\0';
    return 1;
}
3155
Alexander Belopolsky40018472011-02-26 01:02:56 +00003156PyObject *
3157PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003158 Py_ssize_t size,
3159 const char *encoding,
3160 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003161{
3162 PyObject *buffer = NULL, *unicode;
3163 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003164 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3165
3166 if (encoding == NULL) {
3167 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3168 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003169
Fred Drakee4315f52000-05-09 19:53:39 +00003170 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003171 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3172 char *lower = buflower;
3173
3174 /* Fast paths */
3175 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3176 lower += 3;
3177 if (*lower == '_') {
3178 /* Match "utf8" and "utf_8" */
3179 lower++;
3180 }
3181
3182 if (lower[0] == '8' && lower[1] == 0) {
3183 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3184 }
3185 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3186 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3187 }
3188 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3189 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3190 }
3191 }
3192 else {
3193 if (strcmp(lower, "ascii") == 0
3194 || strcmp(lower, "us_ascii") == 0) {
3195 return PyUnicode_DecodeASCII(s, size, errors);
3196 }
Steve Dowercc16be82016-09-08 10:35:16 -07003197 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003198 else if (strcmp(lower, "mbcs") == 0) {
3199 return PyUnicode_DecodeMBCS(s, size, errors);
3200 }
3201 #endif
3202 else if (strcmp(lower, "latin1") == 0
3203 || strcmp(lower, "latin_1") == 0
3204 || strcmp(lower, "iso_8859_1") == 0
3205 || strcmp(lower, "iso8859_1") == 0) {
3206 return PyUnicode_DecodeLatin1(s, size, errors);
3207 }
3208 }
Victor Stinner37296e82010-06-10 13:36:23 +00003209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210
3211 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003212 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003213 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003214 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003215 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 if (buffer == NULL)
3217 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003218 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 if (unicode == NULL)
3220 goto onError;
3221 if (!PyUnicode_Check(unicode)) {
3222 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003223 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3224 "use codecs.decode() to decode to arbitrary types",
3225 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003226 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227 Py_DECREF(unicode);
3228 goto onError;
3229 }
3230 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003231 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003232
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 Py_XDECREF(buffer);
3235 return NULL;
3236}
3237
Alexander Belopolsky40018472011-02-26 01:02:56 +00003238PyObject *
3239PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003240 const char *encoding,
3241 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003242{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003243 if (!PyUnicode_Check(unicode)) {
3244 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003245 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003246 }
3247
Serhiy Storchaka00939072016-10-27 21:05:49 +03003248 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3249 "PyUnicode_AsDecodedObject() is deprecated; "
3250 "use PyCodec_Decode() to decode from str", 1) < 0)
3251 return NULL;
3252
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003253 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003254 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003255
3256 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003257 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003258}
3259
Alexander Belopolsky40018472011-02-26 01:02:56 +00003260PyObject *
3261PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003262 const char *encoding,
3263 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003264{
3265 PyObject *v;
3266
3267 if (!PyUnicode_Check(unicode)) {
3268 PyErr_BadArgument();
3269 goto onError;
3270 }
3271
Serhiy Storchaka00939072016-10-27 21:05:49 +03003272 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3273 "PyUnicode_AsDecodedUnicode() is deprecated; "
3274 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3275 return NULL;
3276
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003277 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003278 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003279
3280 /* Decode via the codec registry */
3281 v = PyCodec_Decode(unicode, encoding, errors);
3282 if (v == NULL)
3283 goto onError;
3284 if (!PyUnicode_Check(v)) {
3285 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003286 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3287 "use codecs.decode() to decode to arbitrary types",
3288 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003289 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003290 Py_DECREF(v);
3291 goto onError;
3292 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003293 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003294
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003296 return NULL;
3297}
3298
Alexander Belopolsky40018472011-02-26 01:02:56 +00003299PyObject *
3300PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003301 Py_ssize_t size,
3302 const char *encoding,
3303 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304{
3305 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003306
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003307 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003309 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3311 Py_DECREF(unicode);
3312 return v;
3313}
3314
Alexander Belopolsky40018472011-02-26 01:02:56 +00003315PyObject *
3316PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003317 const char *encoding,
3318 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003319{
3320 PyObject *v;
3321
3322 if (!PyUnicode_Check(unicode)) {
3323 PyErr_BadArgument();
3324 goto onError;
3325 }
3326
Serhiy Storchaka00939072016-10-27 21:05:49 +03003327 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3328 "PyUnicode_AsEncodedObject() is deprecated; "
3329 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3330 "or PyCodec_Encode() for generic encoding", 1) < 0)
3331 return NULL;
3332
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003333 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003334 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003335
3336 /* Encode via the codec registry */
3337 v = PyCodec_Encode(unicode, encoding, errors);
3338 if (v == NULL)
3339 goto onError;
3340 return v;
3341
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003343 return NULL;
3344}
3345
/* Locate the first wide character of `wstr` that wcstombs() refuses to
   convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 2 bytes wide).  The return value is the
   index, in wchar_t units, of the first piece for which wcstombs() fails.
   If every piece converts, 0 is returned; this is indistinguishable from a
   failure at index 0. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus a terminator that is never
       overwritten. */
#if SIZEOF_WCHAR_T == 2
    wchar_t piece[3] = {0, 0, 0};
#else
    wchar_t piece[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *piece_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep a surrogate pair together. */
            piece[0] = cursor[0];
            piece[1] = cursor[1];
            cursor += 2;
        }
        else {
            piece[0] = cursor[0];
            piece[1] = 0;
            cursor += 1;
        }
#else
        piece[0] = *cursor++;
#endif

        if (wcstombs(scratch, piece, sizeof(scratch)) == (size_t)-1) {
            return piece_start - wstr;
        }
    }

    /* No piece failed on its own: the culprit could not be identified. */
    return 0;
}
3392
Victor Stinner1b579672011-12-17 05:47:23 +01003393static int
3394locale_error_handler(const char *errors, int *surrogateescape)
3395{
Victor Stinner50149202015-09-22 00:26:54 +02003396 _Py_error_handler error_handler = get_error_handler(errors);
3397 switch (error_handler)
3398 {
3399 case _Py_ERROR_STRICT:
Victor Stinner1b579672011-12-17 05:47:23 +01003400 *surrogateescape = 0;
3401 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003402 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner1b579672011-12-17 05:47:23 +01003403 *surrogateescape = 1;
3404 return 0;
Victor Stinner50149202015-09-22 00:26:54 +02003405 default:
3406 PyErr_Format(PyExc_ValueError,
3407 "only 'strict' and 'surrogateescape' error handlers "
3408 "are supported, not '%s'",
3409 errors);
3410 return -1;
Victor Stinner1b579672011-12-17 05:47:23 +01003411 }
Victor Stinner1b579672011-12-17 05:47:23 +01003412}
3413
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003414PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003415PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003416{
3417 Py_ssize_t wlen, wlen2;
3418 wchar_t *wstr;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003419 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003420 PyObject *bytes, *reason, *exc;
3421 size_t error_pos, errlen;
Victor Stinner1b579672011-12-17 05:47:23 +01003422 int surrogateescape;
3423
3424 if (locale_error_handler(errors, &surrogateescape) < 0)
3425 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003426
3427 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3428 if (wstr == NULL)
3429 return NULL;
3430
3431 wlen2 = wcslen(wstr);
3432 if (wlen2 != wlen) {
3433 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003434 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003435 return NULL;
3436 }
3437
3438 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003439 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003440 char *str;
3441
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003442 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003443 if (str == NULL) {
3444 if (error_pos == (size_t)-1) {
3445 PyErr_NoMemory();
3446 PyMem_Free(wstr);
3447 return NULL;
3448 }
3449 else {
3450 goto encode_error;
3451 }
3452 }
3453 PyMem_Free(wstr);
3454
3455 bytes = PyBytes_FromString(str);
3456 PyMem_Free(str);
3457 }
3458 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003459 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003460 size_t len, len2;
3461
3462 len = wcstombs(NULL, wstr, 0);
3463 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003464 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003465 goto encode_error;
3466 }
3467
3468 bytes = PyBytes_FromStringAndSize(NULL, len);
3469 if (bytes == NULL) {
3470 PyMem_Free(wstr);
3471 return NULL;
3472 }
3473
3474 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3475 if (len2 == (size_t)-1 || len2 > len) {
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003476 Py_DECREF(bytes);
Victor Stinner2f197072011-12-17 07:08:30 +01003477 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003478 goto encode_error;
3479 }
3480 PyMem_Free(wstr);
3481 }
3482 return bytes;
3483
3484encode_error:
3485 errmsg = strerror(errno);
3486 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003487
3488 if (error_pos == (size_t)-1)
3489 error_pos = wcstombs_errorpos(wstr);
3490
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003491 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003492
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003493 wstr = Py_DecodeLocale(errmsg, &errlen);
3494 if (wstr != NULL) {
3495 reason = PyUnicode_FromWideChar(wstr, errlen);
3496 PyMem_RawFree(wstr);
3497 } else {
3498 errmsg = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003499 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003500
Victor Stinner2f197072011-12-17 07:08:30 +01003501 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003502 reason = PyUnicode_FromString(
3503 "wcstombs() encountered an unencodable "
3504 "wide character");
3505 if (reason == NULL)
3506 return NULL;
3507
3508 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3509 "locale", unicode,
3510 (Py_ssize_t)error_pos,
3511 (Py_ssize_t)(error_pos+1),
3512 reason);
3513 Py_DECREF(reason);
3514 if (exc != NULL) {
3515 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003516 Py_DECREF(exc);
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003517 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003518 return NULL;
3519}
3520
Victor Stinnerad158722010-10-27 00:25:46 +00003521PyObject *
3522PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003523{
Steve Dowercc16be82016-09-08 10:35:16 -07003524#if defined(__APPLE__)
3525 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerad158722010-10-27 00:25:46 +00003526#else
Victor Stinner793b5312011-04-27 00:24:21 +02003527 PyInterpreterState *interp = PyThreadState_GET()->interp;
3528 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3529 cannot use it to encode and decode filenames before it is loaded. Load
3530 the Python codec requires to encode at least its own filename. Use the C
3531 version of the locale codec until the codec registry is initialized and
3532 the Python codec is loaded.
3533
3534 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3535 cannot only rely on it: check also interp->fscodec_initialized for
3536 subinterpreters. */
3537 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003538 return PyUnicode_AsEncodedString(unicode,
3539 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003540 Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003541 }
3542 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003543 return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003544 }
Victor Stinnerad158722010-10-27 00:25:46 +00003545#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003546}
3547
Alexander Belopolsky40018472011-02-26 01:02:56 +00003548PyObject *
3549PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003550 const char *encoding,
3551 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552{
3553 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003554 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003555
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 if (!PyUnicode_Check(unicode)) {
3557 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003558 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 }
Fred Drakee4315f52000-05-09 19:53:39 +00003560
Victor Stinner942889a2016-09-05 15:40:10 -07003561 if (encoding == NULL) {
3562 return _PyUnicode_AsUTF8String(unicode, errors);
3563 }
3564
Fred Drakee4315f52000-05-09 19:53:39 +00003565 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003566 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3567 char *lower = buflower;
3568
3569 /* Fast paths */
3570 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3571 lower += 3;
3572 if (*lower == '_') {
3573 /* Match "utf8" and "utf_8" */
3574 lower++;
3575 }
3576
3577 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003578 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003579 }
3580 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3581 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3582 }
3583 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3584 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3585 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003586 }
Victor Stinner942889a2016-09-05 15:40:10 -07003587 else {
3588 if (strcmp(lower, "ascii") == 0
3589 || strcmp(lower, "us_ascii") == 0) {
3590 return _PyUnicode_AsASCIIString(unicode, errors);
3591 }
Steve Dowercc16be82016-09-08 10:35:16 -07003592#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003593 else if (strcmp(lower, "mbcs") == 0) {
3594 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3595 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003596#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003597 else if (strcmp(lower, "latin1") == 0 ||
3598 strcmp(lower, "latin_1") == 0 ||
3599 strcmp(lower, "iso_8859_1") == 0 ||
3600 strcmp(lower, "iso8859_1") == 0) {
3601 return _PyUnicode_AsLatin1String(unicode, errors);
3602 }
3603 }
Victor Stinner37296e82010-06-10 13:36:23 +00003604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003605
3606 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003607 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003609 return NULL;
3610
3611 /* The normal path */
3612 if (PyBytes_Check(v))
3613 return v;
3614
3615 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003616 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003617 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003618 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003619
3620 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003621 "encoder %s returned bytearray instead of bytes; "
3622 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003623 encoding);
3624 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003625 Py_DECREF(v);
3626 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003627 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003628
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003629 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3630 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003631 Py_DECREF(v);
3632 return b;
3633 }
3634
3635 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003636 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3637 "use codecs.encode() to encode to arbitrary types",
3638 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003639 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003640 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003641 return NULL;
3642}
3643
Alexander Belopolsky40018472011-02-26 01:02:56 +00003644PyObject *
3645PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003646 const char *encoding,
3647 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003648{
3649 PyObject *v;
3650
3651 if (!PyUnicode_Check(unicode)) {
3652 PyErr_BadArgument();
3653 goto onError;
3654 }
3655
Serhiy Storchaka00939072016-10-27 21:05:49 +03003656 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3657 "PyUnicode_AsEncodedUnicode() is deprecated; "
3658 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3659 return NULL;
3660
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003661 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003663
3664 /* Encode via the codec registry */
3665 v = PyCodec_Encode(unicode, encoding, errors);
3666 if (v == NULL)
3667 goto onError;
3668 if (!PyUnicode_Check(v)) {
3669 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003670 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3671 "use codecs.encode() to encode to arbitrary types",
3672 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003673 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003674 Py_DECREF(v);
3675 goto onError;
3676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003678
Benjamin Peterson29060642009-01-31 22:14:21 +00003679 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 return NULL;
3681}
3682
/* Locate the first byte of `str` (of `len` bytes) that mbrtowc() cannot
   convert.  Returns its offset from the start of the string, or 0 when no
   such byte is found or when mbrtowc() is not available. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining > 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(cursor - str);
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3713
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003714PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003715PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003716 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003717{
3718 wchar_t smallbuf[256];
3719 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3720 wchar_t *wstr;
3721 size_t wlen, wlen2;
3722 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003723 int surrogateescape;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003724 size_t error_pos, errlen;
Victor Stinner2f197072011-12-17 07:08:30 +01003725 char *errmsg;
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003726 PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
Victor Stinner1b579672011-12-17 05:47:23 +01003727
3728 if (locale_error_handler(errors, &surrogateescape) < 0)
3729 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003730
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003731 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3732 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003733 return NULL;
3734 }
3735
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003736 if (surrogateescape) {
3737 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003738 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003739 if (wstr == NULL) {
3740 if (wlen == (size_t)-1)
3741 PyErr_NoMemory();
3742 else
3743 PyErr_SetFromErrno(PyExc_OSError);
3744 return NULL;
3745 }
3746
3747 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003748 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003749 }
3750 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003751 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003752#ifndef HAVE_BROKEN_MBSTOWCS
3753 wlen = mbstowcs(NULL, str, 0);
3754#else
3755 wlen = len;
3756#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003757 if (wlen == (size_t)-1)
3758 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003759 if (wlen+1 <= smallbuf_len) {
3760 wstr = smallbuf;
3761 }
3762 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003763 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003764 if (!wstr)
3765 return PyErr_NoMemory();
3766 }
3767
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003768 wlen2 = mbstowcs(wstr, str, wlen+1);
3769 if (wlen2 == (size_t)-1) {
3770 if (wstr != smallbuf)
3771 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003772 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003773 }
3774#ifdef HAVE_BROKEN_MBSTOWCS
3775 assert(wlen2 == wlen);
3776#endif
3777 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3778 if (wstr != smallbuf)
3779 PyMem_Free(wstr);
3780 }
3781 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003782
3783decode_error:
3784 errmsg = strerror(errno);
3785 assert(errmsg != NULL);
3786
3787 error_pos = mbstowcs_errorpos(str, len);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003788 wstr = Py_DecodeLocale(errmsg, &errlen);
3789 if (wstr != NULL) {
3790 reason = PyUnicode_FromWideChar(wstr, errlen);
3791 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003792 }
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003793
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003794 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003795 reason = PyUnicode_FromString(
3796 "mbstowcs() encountered an invalid multibyte sequence");
3797 if (reason == NULL)
3798 return NULL;
3799
3800 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3801 "locale", str, len,
3802 (Py_ssize_t)error_pos,
3803 (Py_ssize_t)(error_pos+1),
3804 reason);
3805 Py_DECREF(reason);
3806 if (exc != NULL) {
3807 PyCodec_StrictErrors(exc);
Serhiy Storchaka2fbc0192016-10-23 15:41:36 +03003808 Py_DECREF(exc);
Victor Stinner2f197072011-12-17 07:08:30 +01003809 }
3810 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003811}
3812
3813PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003814PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003815{
3816 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003817 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003818}
3819
3820
3821PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003822PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003823 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003824 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3825}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003826
Christian Heimes5894ba72007-11-04 11:43:14 +00003827PyObject*
3828PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3829{
Steve Dowercc16be82016-09-08 10:35:16 -07003830#if defined(__APPLE__)
3831 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003832#else
Victor Stinner793b5312011-04-27 00:24:21 +02003833 PyInterpreterState *interp = PyThreadState_GET()->interp;
3834 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3835 cannot use it to encode and decode filenames before it is loaded. Load
3836 the Python codec requires to encode at least its own filename. Use the C
3837 version of the locale codec until the codec registry is initialized and
3838 the Python codec is loaded.
3839
3840 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3841 cannot only rely on it: check also interp->fscodec_initialized for
3842 subinterpreters. */
3843 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Steve Dower78057b42016-11-06 19:35:08 -08003844 return PyUnicode_Decode(s, size,
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003845 Py_FileSystemDefaultEncoding,
Steve Dowercc16be82016-09-08 10:35:16 -07003846 Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003847 }
3848 else {
Steve Dowercc16be82016-09-08 10:35:16 -07003849 return PyUnicode_DecodeLocaleAndSize(s, size, Py_FileSystemDefaultEncodeErrors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003850 }
Victor Stinnerad158722010-10-27 00:25:46 +00003851#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003852}
3853
Martin v. Löwis011e8422009-05-05 04:43:17 +00003854
3855int
3856PyUnicode_FSConverter(PyObject* arg, void* addr)
3857{
Brett Cannonec6ce872016-09-06 15:50:29 -07003858 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003859 PyObject *output = NULL;
3860 Py_ssize_t size;
3861 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003862 if (arg == NULL) {
3863 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003864 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003865 return 1;
3866 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003867 path = PyOS_FSPath(arg);
3868 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003869 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003870 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003871 if (PyBytes_Check(path)) {
3872 output = path;
3873 }
3874 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3875 output = PyUnicode_EncodeFSDefault(path);
3876 Py_DECREF(path);
3877 if (!output) {
3878 return 0;
3879 }
3880 assert(PyBytes_Check(output));
3881 }
3882
Victor Stinner0ea2a462010-04-30 00:22:08 +00003883 size = PyBytes_GET_SIZE(output);
3884 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003885 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003886 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003887 Py_DECREF(output);
3888 return 0;
3889 }
3890 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003891 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003892}
3893
3894
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003895int
3896PyUnicode_FSDecoder(PyObject* arg, void* addr)
3897{
Brett Cannona5711202016-09-06 19:36:01 -07003898 int is_buffer = 0;
3899 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003900 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003901 if (arg == NULL) {
3902 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003903 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003904 return 1;
3905 }
Brett Cannona5711202016-09-06 19:36:01 -07003906
3907 is_buffer = PyObject_CheckBuffer(arg);
3908 if (!is_buffer) {
3909 path = PyOS_FSPath(arg);
3910 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003911 return 0;
3912 }
Brett Cannona5711202016-09-06 19:36:01 -07003913 }
3914 else {
3915 path = arg;
3916 Py_INCREF(arg);
3917 }
3918
3919 if (PyUnicode_Check(path)) {
3920 if (PyUnicode_READY(path) == -1) {
3921 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003922 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003923 }
3924 output = path;
3925 }
3926 else if (PyBytes_Check(path) || is_buffer) {
3927 PyObject *path_bytes = NULL;
3928
3929 if (!PyBytes_Check(path) &&
3930 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3931 "path should be string, bytes, or os.PathLike, not %.200s",
3932 Py_TYPE(arg)->tp_name)) {
3933 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003934 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003935 }
3936 path_bytes = PyBytes_FromObject(path);
3937 Py_DECREF(path);
3938 if (!path_bytes) {
3939 return 0;
3940 }
3941 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3942 PyBytes_GET_SIZE(path_bytes));
3943 Py_DECREF(path_bytes);
3944 if (!output) {
3945 return 0;
3946 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003947 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003948 else {
3949 PyErr_Format(PyExc_TypeError,
Brett Cannona5711202016-09-06 19:36:01 -07003950 "path should be string, bytes, or os.PathLike, not %.200s",
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003951 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003952 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003953 return 0;
3954 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003955 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003956 Py_DECREF(output);
3957 return 0;
3958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003959 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003960 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003961 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003962 Py_DECREF(output);
3963 return 0;
3964 }
3965 *(PyObject**)addr = output;
3966 return Py_CLEANUP_SUPPORTED;
3967}
3968
3969
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003970const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003972{
Christian Heimesf3863112007-11-22 07:46:41 +00003973 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003975 if (!PyUnicode_Check(unicode)) {
3976 PyErr_BadArgument();
3977 return NULL;
3978 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003979 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003980 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003982 if (PyUnicode_UTF8(unicode) == NULL) {
3983 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003984 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985 if (bytes == NULL)
3986 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003987 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3988 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003989 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003990 Py_DECREF(bytes);
3991 return NULL;
3992 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003993 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003994 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003995 PyBytes_AS_STRING(bytes),
3996 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003997 Py_DECREF(bytes);
3998 }
3999
4000 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004001 *psize = PyUnicode_UTF8_LENGTH(unicode);
4002 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004003}
4004
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004005const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004007{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4009}
4010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011Py_UNICODE *
4012PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4013{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014 const unsigned char *one_byte;
4015#if SIZEOF_WCHAR_T == 4
4016 const Py_UCS2 *two_bytes;
4017#else
4018 const Py_UCS4 *four_bytes;
4019 const Py_UCS4 *ucs4_end;
4020 Py_ssize_t num_surrogates;
4021#endif
4022 wchar_t *w;
4023 wchar_t *wchar_end;
4024
4025 if (!PyUnicode_Check(unicode)) {
4026 PyErr_BadArgument();
4027 return NULL;
4028 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004029 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004031 assert(_PyUnicode_KIND(unicode) != 0);
4032 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004034 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004036 four_bytes = PyUnicode_4BYTE_DATA(unicode);
4037 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038 num_surrogates = 0;
4039
4040 for (; four_bytes < ucs4_end; ++four_bytes) {
4041 if (*four_bytes > 0xFFFF)
4042 ++num_surrogates;
4043 }
4044
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004045 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
4046 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
4047 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 PyErr_NoMemory();
4049 return NULL;
4050 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004051 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004053 w = _PyUnicode_WSTR(unicode);
4054 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
4055 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
4057 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004058 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01004060 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
4061 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062 }
4063 else
4064 *w = *four_bytes;
4065
4066 if (w > wchar_end) {
Barry Warsawb2e57942017-09-14 18:13:16 -07004067 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 }
4069 }
4070 *w = 0;
4071#else
4072 /* sizeof(wchar_t) == 4 */
4073 Py_FatalError("Impossible unicode object state, wstr and str "
4074 "should share memory already.");
4075 return NULL;
4076#endif
4077 }
4078 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02004079 if ((size_t)_PyUnicode_LENGTH(unicode) >
4080 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4081 PyErr_NoMemory();
4082 return NULL;
4083 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004084 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
4085 (_PyUnicode_LENGTH(unicode) + 1));
4086 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004087 PyErr_NoMemory();
4088 return NULL;
4089 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004090 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
4091 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
4092 w = _PyUnicode_WSTR(unicode);
4093 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004094
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004095 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
4096 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097 for (; w < wchar_end; ++one_byte, ++w)
4098 *w = *one_byte;
4099 /* null-terminate the wstr */
4100 *w = 0;
4101 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004102 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004103#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004104 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105 for (; w < wchar_end; ++two_bytes, ++w)
4106 *w = *two_bytes;
4107 /* null-terminate the wstr */
4108 *w = 0;
4109#else
4110 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004111 PyObject_FREE(_PyUnicode_WSTR(unicode));
4112 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004113 Py_FatalError("Impossible unicode object state, wstr "
4114 "and str should share memory already.");
4115 return NULL;
4116#endif
4117 }
4118 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07004119 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120 }
4121 }
4122 }
4123 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004124 *size = PyUnicode_WSTR_LENGTH(unicode);
4125 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004126}
4127
Alexander Belopolsky40018472011-02-26 01:02:56 +00004128Py_UNICODE *
4129PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132}
4133
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004134const Py_UNICODE *
4135_PyUnicode_AsUnicode(PyObject *unicode)
4136{
4137 Py_ssize_t size;
4138 const Py_UNICODE *wstr;
4139
4140 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4141 if (wstr && wcslen(wstr) != (size_t)size) {
4142 PyErr_SetString(PyExc_ValueError, "embedded null character");
4143 return NULL;
4144 }
4145 return wstr;
4146}
4147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004148
Alexander Belopolsky40018472011-02-26 01:02:56 +00004149Py_ssize_t
4150PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151{
4152 if (!PyUnicode_Check(unicode)) {
4153 PyErr_BadArgument();
4154 goto onError;
4155 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004156 if (_PyUnicode_WSTR(unicode) == NULL) {
4157 if (PyUnicode_AsUnicode(unicode) == NULL)
4158 goto onError;
4159 }
4160 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161
Benjamin Peterson29060642009-01-31 22:14:21 +00004162 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163 return -1;
4164}
4165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166Py_ssize_t
4167PyUnicode_GetLength(PyObject *unicode)
4168{
Victor Stinner07621332012-06-16 04:53:46 +02004169 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004170 PyErr_BadArgument();
4171 return -1;
4172 }
Victor Stinner07621332012-06-16 04:53:46 +02004173 if (PyUnicode_READY(unicode) == -1)
4174 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004175 return PyUnicode_GET_LENGTH(unicode);
4176}
4177
4178Py_UCS4
4179PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4180{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004181 void *data;
4182 int kind;
4183
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004184 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004185 PyErr_BadArgument();
4186 return (Py_UCS4)-1;
4187 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004188 if (PyUnicode_READY(unicode) == -1) {
4189 return (Py_UCS4)-1;
4190 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004191 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004192 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004193 return (Py_UCS4)-1;
4194 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004195 data = PyUnicode_DATA(unicode);
4196 kind = PyUnicode_KIND(unicode);
4197 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004198}
4199
4200int
4201PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4202{
4203 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004204 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004205 return -1;
4206 }
Victor Stinner488fa492011-12-12 00:01:39 +01004207 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004208 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004209 PyErr_SetString(PyExc_IndexError, "string index out of range");
4210 return -1;
4211 }
Victor Stinner488fa492011-12-12 00:01:39 +01004212 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004213 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004214 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4215 PyErr_SetString(PyExc_ValueError, "character out of range");
4216 return -1;
4217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004218 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4219 index, ch);
4220 return 0;
4221}
4222
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4228
Victor Stinner554f3f02010-06-16 23:33:54 +00004229/* create or adjust a UnicodeDecodeError */
4230static void
4231make_decode_exception(PyObject **exceptionObject,
4232 const char *encoding,
4233 const char *input, Py_ssize_t length,
4234 Py_ssize_t startpos, Py_ssize_t endpos,
4235 const char *reason)
4236{
4237 if (*exceptionObject == NULL) {
4238 *exceptionObject = PyUnicodeDecodeError_Create(
4239 encoding, input, length, startpos, endpos, reason);
4240 }
4241 else {
4242 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4243 goto onError;
4244 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4245 goto onError;
4246 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4247 goto onError;
4248 }
4249 return;
4250
4251onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004252 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004253}
4254
#ifdef MS_WINDOWS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   In/out parameters:
     *errorHandler     looked up from `errors` on first use and cached.
     *exceptionObject  created or updated via make_decode_exception().
     *input, *inend    re-read from the exception's bytes object after the
                       callback, since the callback may have replaced it.
     *endinpos, *inptr set to the position returned by the callback.
     *output, *outpos  wchar_t-kind output string and write position; the
                       string is resized when the replacement plus the
                       remaining input would not fit.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "Un" is the PyArg_ParseTuple format (str object, Py_ssize_t); the text
       after ';' is the error message, also reused below via &argparse[3]. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen characters.  wmemcpy() is used rather than
       wcsncpy() because the latter stops at the first L'\0' in the source
       and zero-fills the remainder, which would corrupt a replacement
       string that contains an embedded null character. */
    wmemcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, (size_t)repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common error exit */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004367
4368static int
4369unicode_decode_call_errorhandler_writer(
4370 const char *errors, PyObject **errorHandler,
4371 const char *encoding, const char *reason,
4372 const char **input, const char **inend, Py_ssize_t *startinpos,
4373 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4374 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4375{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004376 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004377
4378 PyObject *restuple = NULL;
4379 PyObject *repunicode = NULL;
4380 Py_ssize_t insize;
4381 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004382 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004383 PyObject *inputobj = NULL;
4384
4385 if (*errorHandler == NULL) {
4386 *errorHandler = PyCodec_LookupError(errors);
4387 if (*errorHandler == NULL)
4388 goto onError;
4389 }
4390
4391 make_decode_exception(exceptionObject,
4392 encoding,
4393 *input, *inend - *input,
4394 *startinpos, *endinpos,
4395 reason);
4396 if (*exceptionObject == NULL)
4397 goto onError;
4398
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004399 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004400 if (restuple == NULL)
4401 goto onError;
4402 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004403 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004404 goto onError;
4405 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004406 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004407 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004408
4409 /* Copy back the bytes variables, which might have been modified by the
4410 callback */
4411 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4412 if (!inputobj)
4413 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004414 *input = PyBytes_AS_STRING(inputobj);
4415 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004416 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004417 /* we can DECREF safely, as the exception has another reference,
4418 so the object won't go away. */
4419 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004423 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004424 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004426 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427
Victor Stinner170ca6f2013-04-18 00:25:28 +02004428 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004429 if (replen > 1) {
4430 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004431 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004432 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4433 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4434 goto onError;
4435 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004436 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004437 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004440 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004443 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004444 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004445
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449}
4450
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table, indexed by ASCII code (0..127). */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times, so it must be free of side effects.
 * directO and directWS are parenthesized in the expansion so that
 * compound arguments such as `a || b` keep their meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004531
Alexander Belopolsky40018472011-02-26 01:02:56 +00004532PyObject *
4533PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004534 Py_ssize_t size,
4535 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004537 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4538}
4539
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540/* The decoder. The only state we preserve is our read position,
4541 * i.e. how many characters we have consumed. So if we end in the
4542 * middle of a shift sequence we have to back off the read position
4543 * and the output to the beginning of the sequence, otherwise we lose
4544 * all the shift state (seen bits, number of bits seen, high
4545 * surrogate). */
4546
Alexander Belopolsky40018472011-02-26 01:02:56 +00004547PyObject *
4548PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004549 Py_ssize_t size,
4550 const char *errors,
4551 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004552{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004554 Py_ssize_t startinpos;
4555 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004557 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004558 const char *errmsg = "";
4559 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004560 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 unsigned int base64bits = 0;
4562 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004563 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 PyObject *errorHandler = NULL;
4565 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004566
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004567 if (size == 0) {
4568 if (consumed)
4569 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004570 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004571 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004572
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004573 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004574 _PyUnicodeWriter_Init(&writer);
4575 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004576
4577 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004578 e = s + size;
4579
4580 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004581 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004583 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 if (inShift) { /* in a base-64 section */
4586 if (IS_BASE64(ch)) { /* consume a base-64 character */
4587 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4588 base64bits += 6;
4589 s++;
4590 if (base64bits >= 16) {
4591 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004592 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 base64bits -= 16;
4594 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004595 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596 if (surrogate) {
4597 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004598 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4599 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004600 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004601 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004602 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004603 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004604 }
4605 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004606 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004607 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 }
4610 }
Victor Stinner551ac952011-11-29 22:58:13 +01004611 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 /* first surrogate */
4613 surrogate = outCh;
4614 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004616 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004617 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 }
4619 }
4620 }
4621 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004622 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004623 if (base64bits > 0) { /* left-over bits */
4624 if (base64bits >= 6) {
4625 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004626 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 errmsg = "partial character in shift sequence";
4628 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004629 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004630 else {
4631 /* Some bits remain; they should be zero */
4632 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004633 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 errmsg = "non-zero padding bits in shift sequence";
4635 goto utf7Error;
4636 }
4637 }
4638 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004639 if (surrogate && DECODE_DIRECT(ch)) {
4640 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4641 goto onError;
4642 }
4643 surrogate = 0;
4644 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 /* '-' is absorbed; other terminating
4646 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004647 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649 }
4650 }
4651 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004653 s++; /* consume '+' */
4654 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004655 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004656 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004657 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004658 }
4659 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004660 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004661 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004662 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004664 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004665 }
4666 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004669 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004670 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004671 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004672 else {
4673 startinpos = s-starts;
4674 s++;
4675 errmsg = "unexpected special character";
4676 goto utf7Error;
4677 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004678 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004679utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004681 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004682 errors, &errorHandler,
4683 "utf7", errmsg,
4684 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004685 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004687 }
4688
Antoine Pitrou244651a2009-05-04 18:56:13 +00004689 /* end of string */
4690
4691 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4692 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004693 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004694 if (surrogate ||
4695 (base64bits >= 6) ||
4696 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004698 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004699 errors, &errorHandler,
4700 "utf7", "unterminated shift sequence",
4701 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004702 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004703 goto onError;
4704 if (s < e)
4705 goto restart;
4706 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004707 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708
4709 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004710 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004711 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004712 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004713 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004714 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004715 writer.kind, writer.data, shiftOutStart);
4716 Py_XDECREF(errorHandler);
4717 Py_XDECREF(exc);
4718 _PyUnicodeWriter_Dealloc(&writer);
4719 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004720 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004721 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004722 }
4723 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004724 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004725 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004726 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004727
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 Py_XDECREF(errorHandler);
4729 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004730 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004731
Benjamin Peterson29060642009-01-31 22:14:21 +00004732 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 Py_XDECREF(errorHandler);
4734 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004735 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004736 return NULL;
4737}
4738
4739
Alexander Belopolsky40018472011-02-26 01:02:56 +00004740PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004741_PyUnicode_EncodeUTF7(PyObject *str,
4742 int base64SetO,
4743 int base64WhiteSpace,
4744 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004746 int kind;
4747 void *data;
4748 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004749 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004750 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004751 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004752 unsigned int base64bits = 0;
4753 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004754 char * out;
4755 char * start;
4756
Benjamin Petersonbac79492012-01-14 13:34:47 -05004757 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004758 return NULL;
4759 kind = PyUnicode_KIND(str);
4760 data = PyUnicode_DATA(str);
4761 len = PyUnicode_GET_LENGTH(str);
4762
4763 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004766 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004767 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004768 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004769 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004770 if (v == NULL)
4771 return NULL;
4772
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004773 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004774 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004775 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004776
Antoine Pitrou244651a2009-05-04 18:56:13 +00004777 if (inShift) {
4778 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4779 /* shifting out */
4780 if (base64bits) { /* output remaining bits */
4781 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4782 base64buffer = 0;
4783 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004784 }
4785 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004786 /* Characters not in the BASE64 set implicitly unshift the sequence
4787 so no '-' is required, except if the character is itself a '-' */
4788 if (IS_BASE64(ch) || ch == '-') {
4789 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004790 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 *out++ = (char) ch;
4792 }
4793 else {
4794 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004795 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004796 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004797 else { /* not in a shift sequence */
4798 if (ch == '+') {
4799 *out++ = '+';
4800 *out++ = '-';
4801 }
4802 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4803 *out++ = (char) ch;
4804 }
4805 else {
4806 *out++ = '+';
4807 inShift = 1;
4808 goto encode_char;
4809 }
4810 }
4811 continue;
4812encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004813 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004814 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004815
Antoine Pitrou244651a2009-05-04 18:56:13 +00004816 /* code first surrogate */
4817 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004818 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004819 while (base64bits >= 6) {
4820 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4821 base64bits -= 6;
4822 }
4823 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004824 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004825 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004826 base64bits += 16;
4827 base64buffer = (base64buffer << 16) | ch;
4828 while (base64bits >= 6) {
4829 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4830 base64bits -= 6;
4831 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004832 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004833 if (base64bits)
4834 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4835 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004836 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004837 if (_PyBytes_Resize(&v, out - start) < 0)
4838 return NULL;
4839 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004840}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004841PyObject *
4842PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4843 Py_ssize_t size,
4844 int base64SetO,
4845 int base64WhiteSpace,
4846 const char *errors)
4847{
4848 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004849 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004850 if (tmp == NULL)
4851 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004852 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004853 base64WhiteSpace, errors);
4854 Py_DECREF(tmp);
4855 return result;
4856}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004857
/* The UTF-7 helper macros are private to the codec above. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004863
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864/* --- UTF-8 Codec -------------------------------------------------------- */
4865
Alexander Belopolsky40018472011-02-26 01:02:56 +00004866PyObject *
4867PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004868 Py_ssize_t size,
4869 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870{
Walter Dörwald69652032004-09-07 20:24:22 +00004871 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4872}
4873
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004874#include "stringlib/asciilib.h"
4875#include "stringlib/codecs.h"
4876#include "stringlib/undef.h"
4877
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004878#include "stringlib/ucs1lib.h"
4879#include "stringlib/codecs.h"
4880#include "stringlib/undef.h"
4881
4882#include "stringlib/ucs2lib.h"
4883#include "stringlib/codecs.h"
4884#include "stringlib/undef.h"
4885
4886#include "stringlib/ucs4lib.h"
4887#include "stringlib/codecs.h"
4888#include "stringlib/undef.h"
4889
Antoine Pitrouab868312009-01-10 15:40:25 +00004890/* Mask to quickly check whether a C 'long' contains a
4891 non-ASCII, UTF8-encoded char. */
4892#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004893# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004894#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004895# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004896#else
4897# error C 'long' size should be either 4 or 8!
4898#endif
4899
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004900static Py_ssize_t
4901ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004902{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004903 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004904 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004905
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004906 /*
4907 * Issue #17237: m68k is a bit different from most architectures in
4908 * that objects do not use "natural alignment" - for example, int and
4909 * long are only aligned at 2-byte boundaries. Therefore the assert()
4910 * won't work; also, tests have shown that skipping the "optimised
4911 * version" will even speed up m68k.
4912 */
4913#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004915 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4916 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004917 /* Fast path, see in STRINGLIB(utf8_decode) for
4918 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004919 /* Help allocation */
4920 const char *_p = p;
4921 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922 while (_p < aligned_end) {
4923 unsigned long value = *(const unsigned long *) _p;
4924 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004925 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926 *((unsigned long *)q) = value;
4927 _p += SIZEOF_LONG;
4928 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004929 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004930 p = _p;
4931 while (p < end) {
4932 if ((unsigned char)*p & 0x80)
4933 break;
4934 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004936 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004939#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940 while (p < end) {
4941 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4942 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004943 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004944 /* Help allocation */
4945 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004946 while (_p < aligned_end) {
4947 unsigned long value = *(unsigned long *) _p;
4948 if (value & ASCII_CHAR_MASK)
4949 break;
4950 _p += SIZEOF_LONG;
4951 }
4952 p = _p;
4953 if (_p == end)
4954 break;
4955 }
4956 if ((unsigned char)*p & 0x80)
4957 break;
4958 ++p;
4959 }
4960 memcpy(dest, start, p - start);
4961 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962}
Antoine Pitrouab868312009-01-10 15:40:25 +00004963
Victor Stinner785938e2011-12-11 20:09:03 +01004964PyObject *
4965PyUnicode_DecodeUTF8Stateful(const char *s,
4966 Py_ssize_t size,
4967 const char *errors,
4968 Py_ssize_t *consumed)
4969{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004970 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004971 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004972 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004973
4974 Py_ssize_t startinpos;
4975 Py_ssize_t endinpos;
4976 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004977 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004978 PyObject *exc = NULL;
Victor Stinner1d65d912015-10-05 13:43:50 +02004979 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner785938e2011-12-11 20:09:03 +01004980
4981 if (size == 0) {
4982 if (consumed)
4983 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004984 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004985 }
4986
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4988 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004989 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004990 *consumed = 1;
4991 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004992 }
4993
Victor Stinner8f674cc2013-04-17 23:02:17 +02004994 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004995 writer.min_length = size;
4996 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004997 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004998
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 writer.pos = ascii_decode(s, end, writer.data);
5000 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 while (s < end) {
5002 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005003 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005004
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 if (PyUnicode_IS_ASCII(writer.buffer))
5007 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005008 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005009 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005011 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 } else {
5013 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005014 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 }
5016
5017 switch (ch) {
5018 case 0:
5019 if (s == end || consumed)
5020 goto End;
5021 errmsg = "unexpected end of data";
5022 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005023 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 break;
5025 case 1:
5026 errmsg = "invalid start byte";
5027 startinpos = s - starts;
5028 endinpos = startinpos + 1;
5029 break;
5030 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005031 case 3:
5032 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005033 errmsg = "invalid continuation byte";
5034 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005035 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005036 break;
5037 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005038 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005039 goto onError;
5040 continue;
5041 }
5042
Victor Stinner1d65d912015-10-05 13:43:50 +02005043 if (error_handler == _Py_ERROR_UNKNOWN)
5044 error_handler = get_error_handler(errors);
5045
5046 switch (error_handler) {
5047 case _Py_ERROR_IGNORE:
5048 s += (endinpos - startinpos);
5049 break;
5050
5051 case _Py_ERROR_REPLACE:
5052 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5053 goto onError;
5054 s += (endinpos - startinpos);
5055 break;
5056
5057 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005058 {
5059 Py_ssize_t i;
5060
Victor Stinner1d65d912015-10-05 13:43:50 +02005061 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5062 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005063 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005064 ch = (Py_UCS4)(unsigned char)(starts[i]);
5065 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5066 ch + 0xdc00);
5067 writer.pos++;
5068 }
5069 s += (endinpos - startinpos);
5070 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005071 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005072
5073 default:
5074 if (unicode_decode_call_errorhandler_writer(
5075 errors, &error_handler_obj,
5076 "utf-8", errmsg,
5077 &starts, &end, &startinpos, &endinpos, &exc, &s,
5078 &writer))
5079 goto onError;
5080 }
Victor Stinner785938e2011-12-11 20:09:03 +01005081 }
5082
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084 if (consumed)
5085 *consumed = s - starts;
5086
Victor Stinner1d65d912015-10-05 13:43:50 +02005087 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005088 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005089 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005090
5091onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005092 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005093 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005094 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005095 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005096}
5097
#if defined(__APPLE__) || defined(__ANDROID__)

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X and Android.

   Undecodable bytes are not an error: each such byte b is emitted as the
   lone surrogate 0xDC00 + b.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL if `size` is
   too large for the output buffer size to be computed or if the memory
   allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 wchar_t units (including the
       terminator) are enough.  The test is written so that neither
       "size + 1" nor the multiplication below can overflow. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
        /* The stringlib decoder writes straight into `unicode`, advancing
           s and outpos; it returns when it needs the caller to act. */
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* With 4-byte wchar_t, no code point is expected to be
               handed back to the caller. */
            Py_UNREACHABLE();
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* Status 0 with the input exhausted means we are done. */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ or __ANDROID__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005154/* Primary internal function which creates utf8 encoded bytes objects.
5155
5156 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005157 and allocate exactly as much space needed at the end. Else allocate the
5158 maximum possible needed (4 result bytes per Unicode character), and return
5159 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005160*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005161PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005162_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163{
Victor Stinner6099a032011-12-18 14:22:26 +01005164 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005165 void *data;
5166 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005168 if (!PyUnicode_Check(unicode)) {
5169 PyErr_BadArgument();
5170 return NULL;
5171 }
5172
5173 if (PyUnicode_READY(unicode) == -1)
5174 return NULL;
5175
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005176 if (PyUnicode_UTF8(unicode))
5177 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5178 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005179
5180 kind = PyUnicode_KIND(unicode);
5181 data = PyUnicode_DATA(unicode);
5182 size = PyUnicode_GET_LENGTH(unicode);
5183
Benjamin Petersonead6b532011-12-20 17:23:42 -06005184 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005185 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005186 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005187 case PyUnicode_1BYTE_KIND:
5188 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5189 assert(!PyUnicode_IS_ASCII(unicode));
5190 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5191 case PyUnicode_2BYTE_KIND:
5192 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5193 case PyUnicode_4BYTE_KIND:
5194 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196}
5197
Alexander Belopolsky40018472011-02-26 01:02:56 +00005198PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005199PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5200 Py_ssize_t size,
5201 const char *errors)
5202{
5203 PyObject *v, *unicode;
5204
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005205 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005206 if (unicode == NULL)
5207 return NULL;
5208 v = _PyUnicode_AsUTF8String(unicode, errors);
5209 Py_DECREF(unicode);
5210 return v;
5211}
5212
5213PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005214PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005216 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217}
5218
Walter Dörwald41980ca2007-08-16 21:55:45 +00005219/* --- UTF-32 Codec ------------------------------------------------------- */
5220
5221PyObject *
5222PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 Py_ssize_t size,
5224 const char *errors,
5225 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005226{
5227 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5228}
5229
5230PyObject *
5231PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 Py_ssize_t size,
5233 const char *errors,
5234 int *byteorder,
5235 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005236{
5237 const char *starts = s;
5238 Py_ssize_t startinpos;
5239 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005240 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005241 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005242 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005243 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005244 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005245 PyObject *errorHandler = NULL;
5246 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005247
Walter Dörwald41980ca2007-08-16 21:55:45 +00005248 q = (unsigned char *)s;
5249 e = q + size;
5250
5251 if (byteorder)
5252 bo = *byteorder;
5253
5254 /* Check for BOM marks (U+FEFF) in the input and adjust current
5255 byte order setting accordingly. In native mode, the leading BOM
5256 mark is skipped, in all other modes, it is copied to the output
5257 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005258 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005259 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005260 if (bom == 0x0000FEFF) {
5261 bo = -1;
5262 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005264 else if (bom == 0xFFFE0000) {
5265 bo = 1;
5266 q += 4;
5267 }
5268 if (byteorder)
5269 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005270 }
5271
Victor Stinnere64322e2012-10-30 23:12:47 +01005272 if (q == e) {
5273 if (consumed)
5274 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005275 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276 }
5277
Victor Stinnere64322e2012-10-30 23:12:47 +01005278#ifdef WORDS_BIGENDIAN
5279 le = bo < 0;
5280#else
5281 le = bo <= 0;
5282#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005283 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005284
Victor Stinner8f674cc2013-04-17 23:02:17 +02005285 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005286 writer.min_length = (e - q + 3) / 4;
5287 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005288 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005289
Victor Stinnere64322e2012-10-30 23:12:47 +01005290 while (1) {
5291 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005292 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005293
Victor Stinnere64322e2012-10-30 23:12:47 +01005294 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005295 enum PyUnicode_Kind kind = writer.kind;
5296 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005297 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005298 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005299 if (le) {
5300 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005301 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005302 if (ch > maxch)
5303 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005304 if (kind != PyUnicode_1BYTE_KIND &&
5305 Py_UNICODE_IS_SURROGATE(ch))
5306 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005307 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005308 q += 4;
5309 } while (q <= last);
5310 }
5311 else {
5312 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005313 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005314 if (ch > maxch)
5315 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005316 if (kind != PyUnicode_1BYTE_KIND &&
5317 Py_UNICODE_IS_SURROGATE(ch))
5318 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005319 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005320 q += 4;
5321 } while (q <= last);
5322 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005323 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005324 }
5325
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005326 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005327 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005328 startinpos = ((const char *)q) - starts;
5329 endinpos = startinpos + 4;
5330 }
5331 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005332 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005334 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005335 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005336 startinpos = ((const char *)q) - starts;
5337 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005339 else {
5340 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005341 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005342 goto onError;
5343 q += 4;
5344 continue;
5345 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005346 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005347 startinpos = ((const char *)q) - starts;
5348 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005350
5351 /* The remaining input chars are ignored if the callback
5352 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005353 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005355 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005357 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005359 }
5360
Walter Dörwald41980ca2007-08-16 21:55:45 +00005361 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005363
Walter Dörwald41980ca2007-08-16 21:55:45 +00005364 Py_XDECREF(errorHandler);
5365 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005366 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005367
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005369 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370 Py_XDECREF(errorHandler);
5371 Py_XDECREF(exc);
5372 return NULL;
5373}
5374
5375PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005376_PyUnicode_EncodeUTF32(PyObject *str,
5377 const char *errors,
5378 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005379{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005380 enum PyUnicode_Kind kind;
5381 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005382 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005383 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005384 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005385#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005386 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005387#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005388 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005389#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005390 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005391 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005392 PyObject *errorHandler = NULL;
5393 PyObject *exc = NULL;
5394 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005395
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005396 if (!PyUnicode_Check(str)) {
5397 PyErr_BadArgument();
5398 return NULL;
5399 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005400 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005401 return NULL;
5402 kind = PyUnicode_KIND(str);
5403 data = PyUnicode_DATA(str);
5404 len = PyUnicode_GET_LENGTH(str);
5405
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005406 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005407 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005408 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005409 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005410 if (v == NULL)
5411 return NULL;
5412
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005413 /* output buffer is 4-bytes aligned */
5414 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005415 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005416 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005417 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005418 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005419 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005420
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005421 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005422 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005423 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005424 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005425 else
5426 encoding = "utf-32";
5427
5428 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005429 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5430 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005431 }
5432
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005433 pos = 0;
5434 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005435 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005436
5437 if (kind == PyUnicode_2BYTE_KIND) {
5438 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5439 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005440 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005441 else {
5442 assert(kind == PyUnicode_4BYTE_KIND);
5443 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5444 &out, native_ordering);
5445 }
5446 if (pos == len)
5447 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005448
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005449 rep = unicode_encode_call_errorhandler(
5450 errors, &errorHandler,
5451 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005452 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005453 if (!rep)
5454 goto error;
5455
5456 if (PyBytes_Check(rep)) {
5457 repsize = PyBytes_GET_SIZE(rep);
5458 if (repsize & 3) {
5459 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005460 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005461 "surrogates not allowed");
5462 goto error;
5463 }
5464 moreunits = repsize / 4;
5465 }
5466 else {
5467 assert(PyUnicode_Check(rep));
5468 if (PyUnicode_READY(rep) < 0)
5469 goto error;
5470 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5471 if (!PyUnicode_IS_ASCII(rep)) {
5472 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005473 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005474 "surrogates not allowed");
5475 goto error;
5476 }
5477 }
5478
5479 /* four bytes are reserved for each surrogate */
5480 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005481 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005482 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005483 /* integer overflow */
5484 PyErr_NoMemory();
5485 goto error;
5486 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005487 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005488 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005489 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005490 }
5491
5492 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005493 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005494 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005495 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005496 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005497 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5498 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005499 }
5500
5501 Py_CLEAR(rep);
5502 }
5503
5504 /* Cut back to size actually needed. This is necessary for, for example,
5505 encoding of a string containing isolated surrogates and the 'ignore'
5506 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005507 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005508 if (nsize != PyBytes_GET_SIZE(v))
5509 _PyBytes_Resize(&v, nsize);
5510 Py_XDECREF(errorHandler);
5511 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005512 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005513 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005514 error:
5515 Py_XDECREF(rep);
5516 Py_XDECREF(errorHandler);
5517 Py_XDECREF(exc);
5518 Py_XDECREF(v);
5519 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005520}
5521
Alexander Belopolsky40018472011-02-26 01:02:56 +00005522PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005523PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5524 Py_ssize_t size,
5525 const char *errors,
5526 int byteorder)
5527{
5528 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005529 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005530 if (tmp == NULL)
5531 return NULL;
5532 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5533 Py_DECREF(tmp);
5534 return result;
5535}
5536
5537PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005538PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005539{
Victor Stinnerb960b342011-11-20 19:12:52 +01005540 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005541}
5542
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543/* --- UTF-16 Codec ------------------------------------------------------- */
5544
Tim Peters772747b2001-08-09 22:21:55 +00005545PyObject *
5546PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 Py_ssize_t size,
5548 const char *errors,
5549 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550{
Walter Dörwald69652032004-09-07 20:24:22 +00005551 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5552}
5553
5554PyObject *
5555PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 Py_ssize_t size,
5557 const char *errors,
5558 int *byteorder,
5559 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005560{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005561 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005562 Py_ssize_t startinpos;
5563 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005564 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005565 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005566 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005567 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005568 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005569 PyObject *errorHandler = NULL;
5570 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005571 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572
Tim Peters772747b2001-08-09 22:21:55 +00005573 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005574 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575
5576 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005577 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005579 /* Check for BOM marks (U+FEFF) in the input and adjust current
5580 byte order setting accordingly. In native mode, the leading BOM
5581 mark is skipped, in all other modes, it is copied to the output
5582 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005583 if (bo == 0 && size >= 2) {
5584 const Py_UCS4 bom = (q[1] << 8) | q[0];
5585 if (bom == 0xFEFF) {
5586 q += 2;
5587 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005589 else if (bom == 0xFFFE) {
5590 q += 2;
5591 bo = 1;
5592 }
5593 if (byteorder)
5594 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596
Antoine Pitrou63065d72012-05-15 23:48:04 +02005597 if (q == e) {
5598 if (consumed)
5599 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005600 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005601 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005602
Christian Heimes743e0cd2012-10-17 23:52:17 +02005603#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005604 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005605 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005606#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005607 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005608 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005609#endif
Tim Peters772747b2001-08-09 22:21:55 +00005610
Antoine Pitrou63065d72012-05-15 23:48:04 +02005611 /* Note: size will always be longer than the resulting Unicode
5612 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005613 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005614 writer.min_length = (e - q + 1) / 2;
5615 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005616 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005617
Antoine Pitrou63065d72012-05-15 23:48:04 +02005618 while (1) {
5619 Py_UCS4 ch = 0;
5620 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005621 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005622 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005624 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005625 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005626 native_ordering);
5627 else
5628 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005630 native_ordering);
5631 } else if (kind == PyUnicode_2BYTE_KIND) {
5632 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005634 native_ordering);
5635 } else {
5636 assert(kind == PyUnicode_4BYTE_KIND);
5637 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005638 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005639 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005640 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005641 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642
Antoine Pitrou63065d72012-05-15 23:48:04 +02005643 switch (ch)
5644 {
5645 case 0:
5646 /* remaining byte at the end? (size should be even) */
5647 if (q == e || consumed)
5648 goto End;
5649 errmsg = "truncated data";
5650 startinpos = ((const char *)q) - starts;
5651 endinpos = ((const char *)e) - starts;
5652 break;
5653 /* The remaining input chars are ignored if the callback
5654 chooses to skip the input */
5655 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005656 q -= 2;
5657 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005658 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005659 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005660 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005661 endinpos = ((const char *)e) - starts;
5662 break;
5663 case 2:
5664 errmsg = "illegal encoding";
5665 startinpos = ((const char *)q) - 2 - starts;
5666 endinpos = startinpos + 2;
5667 break;
5668 case 3:
5669 errmsg = "illegal UTF-16 surrogate";
5670 startinpos = ((const char *)q) - 4 - starts;
5671 endinpos = startinpos + 2;
5672 break;
5673 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005674 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005675 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 continue;
5677 }
5678
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005679 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005680 errors,
5681 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005682 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005683 &starts,
5684 (const char **)&e,
5685 &startinpos,
5686 &endinpos,
5687 &exc,
5688 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 }
5692
Antoine Pitrou63065d72012-05-15 23:48:04 +02005693End:
Walter Dörwald69652032004-09-07 20:24:22 +00005694 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 Py_XDECREF(errorHandler);
5698 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005699 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005702 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703 Py_XDECREF(errorHandler);
5704 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 return NULL;
5706}
5707
Tim Peters772747b2001-08-09 22:21:55 +00005708PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005709_PyUnicode_EncodeUTF16(PyObject *str,
5710 const char *errors,
5711 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005713 enum PyUnicode_Kind kind;
5714 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005715 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005716 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005717 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005718 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005719#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005720 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005721#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005722 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005723#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005724 const char *encoding;
5725 Py_ssize_t nsize, pos;
5726 PyObject *errorHandler = NULL;
5727 PyObject *exc = NULL;
5728 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005729
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005730 if (!PyUnicode_Check(str)) {
5731 PyErr_BadArgument();
5732 return NULL;
5733 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005734 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005735 return NULL;
5736 kind = PyUnicode_KIND(str);
5737 data = PyUnicode_DATA(str);
5738 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005739
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005740 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005741 if (kind == PyUnicode_4BYTE_KIND) {
5742 const Py_UCS4 *in = (const Py_UCS4 *)data;
5743 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005744 while (in < end) {
5745 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005746 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005747 }
5748 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005749 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005750 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005752 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005753 nsize = len + pairs + (byteorder == 0);
5754 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005755 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005757 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005759 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005760 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005761 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005762 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005763 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005764 }
5765 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005766 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005767 }
Tim Peters772747b2001-08-09 22:21:55 +00005768
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005769 if (kind == PyUnicode_1BYTE_KIND) {
5770 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5771 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005772 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005773
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005774 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005775 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005776 }
5777 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005778 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005779 }
5780 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005781 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005782 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005783
5784 pos = 0;
5785 while (pos < len) {
5786 Py_ssize_t repsize, moreunits;
5787
5788 if (kind == PyUnicode_2BYTE_KIND) {
5789 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5790 &out, native_ordering);
5791 }
5792 else {
5793 assert(kind == PyUnicode_4BYTE_KIND);
5794 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5795 &out, native_ordering);
5796 }
5797 if (pos == len)
5798 break;
5799
5800 rep = unicode_encode_call_errorhandler(
5801 errors, &errorHandler,
5802 encoding, "surrogates not allowed",
5803 str, &exc, pos, pos + 1, &pos);
5804 if (!rep)
5805 goto error;
5806
5807 if (PyBytes_Check(rep)) {
5808 repsize = PyBytes_GET_SIZE(rep);
5809 if (repsize & 1) {
5810 raise_encode_exception(&exc, encoding,
5811 str, pos - 1, pos,
5812 "surrogates not allowed");
5813 goto error;
5814 }
5815 moreunits = repsize / 2;
5816 }
5817 else {
5818 assert(PyUnicode_Check(rep));
5819 if (PyUnicode_READY(rep) < 0)
5820 goto error;
5821 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5822 if (!PyUnicode_IS_ASCII(rep)) {
5823 raise_encode_exception(&exc, encoding,
5824 str, pos - 1, pos,
5825 "surrogates not allowed");
5826 goto error;
5827 }
5828 }
5829
5830 /* two bytes are reserved for each surrogate */
5831 if (moreunits > 1) {
5832 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005833 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005834 /* integer overflow */
5835 PyErr_NoMemory();
5836 goto error;
5837 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005838 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005839 goto error;
5840 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5841 }
5842
5843 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005844 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005845 out += moreunits;
5846 } else /* rep is unicode */ {
5847 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5848 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5849 &out, native_ordering);
5850 }
5851
5852 Py_CLEAR(rep);
5853 }
5854
5855 /* Cut back to size actually needed. This is necessary for, for example,
5856 encoding of a string containing isolated surrogates and the 'ignore' handler
5857 is used. */
5858 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5859 if (nsize != PyBytes_GET_SIZE(v))
5860 _PyBytes_Resize(&v, nsize);
5861 Py_XDECREF(errorHandler);
5862 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005863 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005864 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005865 error:
5866 Py_XDECREF(rep);
5867 Py_XDECREF(errorHandler);
5868 Py_XDECREF(exc);
5869 Py_XDECREF(v);
5870 return NULL;
5871#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872}
5873
Alexander Belopolsky40018472011-02-26 01:02:56 +00005874PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005875PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5876 Py_ssize_t size,
5877 const char *errors,
5878 int byteorder)
5879{
5880 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005881 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882 if (tmp == NULL)
5883 return NULL;
5884 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5885 Py_DECREF(tmp);
5886 return result;
5887}
5888
5889PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005890PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893}
5894
5895/* --- Unicode Escape Codec ----------------------------------------------- */
5896
/* Cached pointer to the character-name lookup C API.  It stays NULL until
   the \N{...} branch of the unicode-escape decoder imports it on first
   use via PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1). */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005897static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005898
Alexander Belopolsky40018472011-02-26 01:02:56 +00005899PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04005900_PyUnicode_DecodeUnicodeEscape(const char *s,
5901 Py_ssize_t size,
5902 const char *errors,
5903 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005906 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005908 PyObject *errorHandler = NULL;
5909 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005910
Eric V. Smith42454af2016-10-31 09:22:08 -04005911 // so we can remember if we've seen an invalid escape char or not
5912 *first_invalid_escape = NULL;
5913
Victor Stinner62ec3312016-09-06 17:04:34 -07005914 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005915 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07005916 }
5917 /* Escaped strings will always be longer than the resulting
5918 Unicode string, so we start with size here and then reduce the
5919 length after conversion to the true value.
5920 (but if the error callback returns a long replacement string
5921 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005922 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07005923 writer.min_length = size;
5924 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
5925 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005926 }
5927
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 end = s + size;
5929 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07005930 unsigned char c = (unsigned char) *s++;
5931 Py_UCS4 ch;
5932 int count;
5933 Py_ssize_t startinpos;
5934 Py_ssize_t endinpos;
5935 const char *message;
5936
5937#define WRITE_ASCII_CHAR(ch) \
5938 do { \
5939 assert(ch <= 127); \
5940 assert(writer.pos < writer.size); \
5941 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5942 } while(0)
5943
5944#define WRITE_CHAR(ch) \
5945 do { \
5946 if (ch <= writer.maxchar) { \
5947 assert(writer.pos < writer.size); \
5948 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
5949 } \
5950 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
5951 goto onError; \
5952 } \
5953 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954
5955 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07005956 if (c != '\\') {
5957 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 continue;
5959 }
5960
Victor Stinner62ec3312016-09-06 17:04:34 -07005961 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005963 if (s >= end) {
5964 message = "\\ at end of string";
5965 goto error;
5966 }
5967 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005968
Victor Stinner62ec3312016-09-06 17:04:34 -07005969 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005970 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07005973 case '\n': continue;
5974 case '\\': WRITE_ASCII_CHAR('\\'); continue;
5975 case '\'': WRITE_ASCII_CHAR('\''); continue;
5976 case '\"': WRITE_ASCII_CHAR('\"'); continue;
5977 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005978 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07005979 case 'f': WRITE_ASCII_CHAR('\014'); continue;
5980 case 't': WRITE_ASCII_CHAR('\t'); continue;
5981 case 'n': WRITE_ASCII_CHAR('\n'); continue;
5982 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005983 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07005984 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005985 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07005986 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 case '0': case '1': case '2': case '3':
5990 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07005991 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005992 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07005993 ch = (ch<<3) + *s++ - '0';
5994 if (s < end && '0' <= *s && *s <= '7') {
5995 ch = (ch<<3) + *s++ - '0';
5996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 }
Victor Stinner62ec3312016-09-06 17:04:34 -07005998 WRITE_CHAR(ch);
5999 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 /* hex escapes */
6002 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006004 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006005 message = "truncated \\xXX escape";
6006 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006010 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006011 message = "truncated \\uXXXX escape";
6012 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006015 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006016 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006017 message = "truncated \\UXXXXXXXX escape";
6018 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006019 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006020 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006021 ch <<= 4;
6022 if (c >= '0' && c <= '9') {
6023 ch += c - '0';
6024 }
6025 else if (c >= 'a' && c <= 'f') {
6026 ch += c - ('a' - 10);
6027 }
6028 else if (c >= 'A' && c <= 'F') {
6029 ch += c - ('A' - 10);
6030 }
6031 else {
6032 break;
6033 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006034 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006035 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006036 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006037 }
6038
6039 /* when we get here, ch is a 32-bit unicode character */
6040 if (ch > MAX_UNICODE) {
6041 message = "illegal Unicode character";
6042 goto error;
6043 }
6044
6045 WRITE_CHAR(ch);
6046 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006047
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006049 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006050 if (ucnhash_CAPI == NULL) {
6051 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006052 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6053 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006054 if (ucnhash_CAPI == NULL) {
6055 PyErr_SetString(
6056 PyExc_UnicodeError,
6057 "\\N escapes not supported (can't load unicodedata module)"
6058 );
6059 goto onError;
6060 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006061 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006062
6063 message = "malformed \\N character escape";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006064 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006065 const char *start = ++s;
6066 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006067 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006068 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006069 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006070 namelen = s - start;
6071 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006072 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006073 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006074 ch = 0xffffffff; /* in case 'getcode' messes up */
6075 if (namelen <= INT_MAX &&
6076 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6077 &ch, 0)) {
6078 assert(ch <= MAX_UNICODE);
6079 WRITE_CHAR(ch);
6080 continue;
6081 }
6082 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006083 }
6084 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006085 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006086
6087 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006088 if (*first_invalid_escape == NULL) {
6089 *first_invalid_escape = s-1; /* Back up one char, since we've
6090 already incremented s. */
6091 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006092 WRITE_ASCII_CHAR('\\');
6093 WRITE_CHAR(c);
6094 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006096
6097 error:
6098 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006099 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006100 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006101 errors, &errorHandler,
6102 "unicodeescape", message,
6103 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006104 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006105 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006106 }
6107 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6108 goto onError;
6109 }
6110
6111#undef WRITE_ASCII_CHAR
6112#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006114
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006115 Py_XDECREF(errorHandler);
6116 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006117 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006118
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006120 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 Py_XDECREF(errorHandler);
6122 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 return NULL;
6124}
6125
Eric V. Smith42454af2016-10-31 09:22:08 -04006126PyObject *
6127PyUnicode_DecodeUnicodeEscape(const char *s,
6128 Py_ssize_t size,
6129 const char *errors)
6130{
6131 const char *first_invalid_escape;
6132 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6133 &first_invalid_escape);
6134 if (result == NULL)
6135 return NULL;
6136 if (first_invalid_escape != NULL) {
6137 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6138 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006139 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006140 Py_DECREF(result);
6141 return NULL;
6142 }
6143 }
6144 return result;
6145}
6146
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006147/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148
Alexander Belopolsky40018472011-02-26 01:02:56 +00006149PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006150PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006153 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006155 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006157 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158
Ezio Melottie7f90372012-10-05 03:33:31 +03006159 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006160 escape.
6161
Ezio Melottie7f90372012-10-05 03:33:31 +03006162 For UCS1 strings it's '\xxx', 4 bytes per source character.
6163 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6164 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006165 */
6166
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167 if (!PyUnicode_Check(unicode)) {
6168 PyErr_BadArgument();
6169 return NULL;
6170 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006171 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006173 }
Victor Stinner358af132015-10-12 22:36:57 +02006174
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006176 if (len == 0) {
6177 return PyBytes_FromStringAndSize(NULL, 0);
6178 }
6179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 kind = PyUnicode_KIND(unicode);
6181 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006182 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6183 bytes, and 1 byte characters 4. */
6184 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006185 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006186 return PyErr_NoMemory();
6187 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006188 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 if (repr == NULL) {
6190 return NULL;
6191 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006192
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006194 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006195 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006196
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 /* U+0000-U+00ff range */
6198 if (ch < 0x100) {
6199 if (ch >= ' ' && ch < 127) {
6200 if (ch != '\\') {
6201 /* Copy printable US ASCII as-is */
6202 *p++ = (char) ch;
6203 }
6204 /* Escape backslashes */
6205 else {
6206 *p++ = '\\';
6207 *p++ = '\\';
6208 }
6209 }
Victor Stinner358af132015-10-12 22:36:57 +02006210
Victor Stinner62ec3312016-09-06 17:04:34 -07006211 /* Map special whitespace to '\t', \n', '\r' */
6212 else if (ch == '\t') {
6213 *p++ = '\\';
6214 *p++ = 't';
6215 }
6216 else if (ch == '\n') {
6217 *p++ = '\\';
6218 *p++ = 'n';
6219 }
6220 else if (ch == '\r') {
6221 *p++ = '\\';
6222 *p++ = 'r';
6223 }
6224
6225 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6226 else {
6227 *p++ = '\\';
6228 *p++ = 'x';
6229 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6230 *p++ = Py_hexdigits[ch & 0x000F];
6231 }
Tim Petersced69f82003-09-16 20:30:58 +00006232 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006233 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006234 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 *p++ = '\\';
6236 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006237 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6238 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6239 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6240 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6243 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006244
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 /* Make sure that the first two digits are zero */
6246 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006247 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 *p++ = 'U';
6249 *p++ = '0';
6250 *p++ = '0';
6251 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6252 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6253 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6254 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6255 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6256 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259
Victor Stinner62ec3312016-09-06 17:04:34 -07006260 assert(p - PyBytes_AS_STRING(repr) > 0);
6261 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6262 return NULL;
6263 }
6264 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265}
6266
Alexander Belopolsky40018472011-02-26 01:02:56 +00006267PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006268PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6269 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006271 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006272 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006273 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006275 }
6276
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006277 result = PyUnicode_AsUnicodeEscapeString(tmp);
6278 Py_DECREF(tmp);
6279 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280}
6281
6282/* --- Raw Unicode Escape Codec ------------------------------------------- */
6283
Alexander Belopolsky40018472011-02-26 01:02:56 +00006284PyObject *
6285PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006286 Py_ssize_t size,
6287 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006290 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006292 PyObject *errorHandler = NULL;
6293 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006294
Victor Stinner62ec3312016-09-06 17:04:34 -07006295 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006296 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006297 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006298
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 /* Escaped strings will always be longer than the resulting
6300 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301 length after conversion to the true value. (But decoding error
6302 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006303 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 writer.min_length = size;
6305 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6306 goto onError;
6307 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006308
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 end = s + size;
6310 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 unsigned char c = (unsigned char) *s++;
6312 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006313 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 Py_ssize_t startinpos;
6315 Py_ssize_t endinpos;
6316 const char *message;
6317
6318#define WRITE_CHAR(ch) \
6319 do { \
6320 if (ch <= writer.maxchar) { \
6321 assert(writer.pos < writer.size); \
6322 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6323 } \
6324 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6325 goto onError; \
6326 } \
6327 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 if (c != '\\' || s >= end) {
6331 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006334
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 c = (unsigned char) *s++;
6336 if (c == 'u') {
6337 count = 4;
6338 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 else if (c == 'U') {
6341 count = 8;
6342 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006343 }
6344 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 assert(writer.pos < writer.size);
6346 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6347 WRITE_CHAR(c);
6348 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006349 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006350 startinpos = s - starts - 2;
6351
6352 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6353 for (ch = 0; count && s < end; ++s, --count) {
6354 c = (unsigned char)*s;
6355 ch <<= 4;
6356 if (c >= '0' && c <= '9') {
6357 ch += c - '0';
6358 }
6359 else if (c >= 'a' && c <= 'f') {
6360 ch += c - ('a' - 10);
6361 }
6362 else if (c >= 'A' && c <= 'F') {
6363 ch += c - ('A' - 10);
6364 }
6365 else {
6366 break;
6367 }
6368 }
6369 if (!count) {
6370 if (ch <= MAX_UNICODE) {
6371 WRITE_CHAR(ch);
6372 continue;
6373 }
6374 message = "\\Uxxxxxxxx out of range";
6375 }
6376
6377 endinpos = s-starts;
6378 writer.min_length = end - s + writer.pos;
6379 if (unicode_decode_call_errorhandler_writer(
6380 errors, &errorHandler,
6381 "rawunicodeescape", message,
6382 &starts, &end, &startinpos, &endinpos, &exc, &s,
6383 &writer)) {
6384 goto onError;
6385 }
6386 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) {
6387 goto onError;
6388 }
6389
6390#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392 Py_XDECREF(errorHandler);
6393 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006394 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006395
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006397 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398 Py_XDECREF(errorHandler);
6399 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006401
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402}
6403
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006404
Alexander Belopolsky40018472011-02-26 01:02:56 +00006405PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407{
Victor Stinner62ec3312016-09-06 17:04:34 -07006408 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 int kind;
6412 void *data;
6413 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006415 if (!PyUnicode_Check(unicode)) {
6416 PyErr_BadArgument();
6417 return NULL;
6418 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006420 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006421 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006422 kind = PyUnicode_KIND(unicode);
6423 data = PyUnicode_DATA(unicode);
6424 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 if (kind == PyUnicode_1BYTE_KIND) {
6426 return PyBytes_FromStringAndSize(data, len);
6427 }
Victor Stinner0e368262011-11-10 20:12:49 +01006428
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6430 bytes, and 1 byte characters 4. */
6431 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006432
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 if (len > PY_SSIZE_T_MAX / expandsize) {
6434 return PyErr_NoMemory();
6435 }
6436 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6437 if (repr == NULL) {
6438 return NULL;
6439 }
6440 if (len == 0) {
6441 return repr;
6442 }
6443
6444 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006445 for (pos = 0; pos < len; pos++) {
6446 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006447
Victor Stinner62ec3312016-09-06 17:04:34 -07006448 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6449 if (ch < 0x100) {
6450 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006451 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006452 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6453 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 *p++ = '\\';
6455 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006456 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6457 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6458 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6459 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006461 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6462 else {
6463 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6464 *p++ = '\\';
6465 *p++ = 'U';
6466 *p++ = '0';
6467 *p++ = '0';
6468 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6469 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6470 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6471 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6472 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6473 *p++ = Py_hexdigits[ch & 15];
6474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006476
Victor Stinner62ec3312016-09-06 17:04:34 -07006477 assert(p > PyBytes_AS_STRING(repr));
6478 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6479 return NULL;
6480 }
6481 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482}
6483
Alexander Belopolsky40018472011-02-26 01:02:56 +00006484PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006485PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6486 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006488 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006489 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006490 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006491 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6493 Py_DECREF(tmp);
6494 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495}
6496
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006497/* --- Unicode Internal Codec ------------------------------------------- */
6498
Alexander Belopolsky40018472011-02-26 01:02:56 +00006499PyObject *
6500_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006501 Py_ssize_t size,
6502 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006503{
6504 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006505 Py_ssize_t startinpos;
6506 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006507 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006508 const char *end;
6509 const char *reason;
6510 PyObject *errorHandler = NULL;
6511 PyObject *exc = NULL;
6512
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006513 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006514 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006515 1))
6516 return NULL;
6517
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006518 if (size < 0) {
6519 PyErr_BadInternalCall();
6520 return NULL;
6521 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006522 if (size == 0)
6523 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006524
Victor Stinner8f674cc2013-04-17 23:02:17 +02006525 _PyUnicodeWriter_Init(&writer);
6526 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6527 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006529 }
6530 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006531
Victor Stinner8f674cc2013-04-17 23:02:17 +02006532 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006533 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006534 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006535 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006536 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006537 endinpos = end-starts;
6538 reason = "truncated input";
6539 goto error;
6540 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006541 /* We copy the raw representation one byte at a time because the
6542 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006543 ((char *) &uch)[0] = s[0];
6544 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006545#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006546 ((char *) &uch)[2] = s[2];
6547 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006548#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006549 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006550#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006551 /* We have to sanity check the raw data, otherwise doom looms for
6552 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006553 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006554 endinpos = s - starts + Py_UNICODE_SIZE;
6555 reason = "illegal code point (> 0x10FFFF)";
6556 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006557 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006558#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006559 s += Py_UNICODE_SIZE;
6560#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006561 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006562 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006563 Py_UNICODE uch2;
6564 ((char *) &uch2)[0] = s[0];
6565 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006566 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006567 {
Victor Stinner551ac952011-11-29 22:58:13 +01006568 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006569 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006570 }
6571 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006572#endif
6573
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006574 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006575 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006576 continue;
6577
6578 error:
6579 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006580 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006581 errors, &errorHandler,
6582 "unicode_internal", reason,
6583 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006584 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006585 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006586 }
6587
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006588 Py_XDECREF(errorHandler);
6589 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006590 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006591
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006593 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006594 Py_XDECREF(errorHandler);
6595 Py_XDECREF(exc);
6596 return NULL;
6597}
6598
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599/* --- Latin-1 Codec ------------------------------------------------------ */
6600
Alexander Belopolsky40018472011-02-26 01:02:56 +00006601PyObject *
6602PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006603 Py_ssize_t size,
6604 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006607 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608}
6609
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006610/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006611static void
6612make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006613 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006614 PyObject *unicode,
6615 Py_ssize_t startpos, Py_ssize_t endpos,
6616 const char *reason)
6617{
6618 if (*exceptionObject == NULL) {
6619 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006621 encoding, unicode, startpos, endpos, reason);
6622 }
6623 else {
6624 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6625 goto onError;
6626 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6627 goto onError;
6628 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6629 goto onError;
6630 return;
6631 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006632 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006633 }
6634}
6635
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006637static void
6638raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006639 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006640 PyObject *unicode,
6641 Py_ssize_t startpos, Py_ssize_t endpos,
6642 const char *reason)
6643{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006644 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006645 encoding, unicode, startpos, endpos, reason);
6646 if (*exceptionObject != NULL)
6647 PyCodec_StrictErrors(*exceptionObject);
6648}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006649
6650/* error handling callback helper:
6651 build arguments, call the callback and check the arguments,
6652 put the result into newpos and return the replacement string, which
6653 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006654static PyObject *
6655unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006656 PyObject **errorHandler,
6657 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006658 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006659 Py_ssize_t startpos, Py_ssize_t endpos,
6660 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006662 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006664 PyObject *restuple;
6665 PyObject *resunicode;
6666
6667 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006671 }
6672
Benjamin Petersonbac79492012-01-14 13:34:47 -05006673 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 return NULL;
6675 len = PyUnicode_GET_LENGTH(unicode);
6676
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006677 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006679 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006682 restuple = PyObject_CallFunctionObjArgs(
6683 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006687 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 Py_DECREF(restuple);
6689 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006691 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006692 &resunicode, newpos)) {
6693 Py_DECREF(restuple);
6694 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006696 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6697 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6698 Py_DECREF(restuple);
6699 return NULL;
6700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 *newpos = len + *newpos;
6703 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006704 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 Py_DECREF(restuple);
6706 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006707 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708 Py_INCREF(resunicode);
6709 Py_DECREF(restuple);
6710 return resunicode;
6711}
6712
Alexander Belopolsky40018472011-02-26 01:02:56 +00006713static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006714unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006715 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006716 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718 /* input state */
6719 Py_ssize_t pos=0, size;
6720 int kind;
6721 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 /* pointer into the output */
6723 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006724 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6725 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006726 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006727 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006728 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006729 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006730 /* output object */
6731 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732
Benjamin Petersonbac79492012-01-14 13:34:47 -05006733 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006734 return NULL;
6735 size = PyUnicode_GET_LENGTH(unicode);
6736 kind = PyUnicode_KIND(unicode);
6737 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006738 /* allocate enough for a simple encoding without
6739 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006740 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006741 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006742
6743 _PyBytesWriter_Init(&writer);
6744 str = _PyBytesWriter_Alloc(&writer, size);
6745 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006746 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006747
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006748 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006749 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006750
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006752 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006754 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006755 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006756 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006758 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006760 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006761 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006763
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006764 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006766
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006767 /* Only overallocate the buffer if it's not the last write */
6768 writer.overallocate = (collend < size);
6769
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006771 if (error_handler == _Py_ERROR_UNKNOWN)
6772 error_handler = get_error_handler(errors);
6773
6774 switch (error_handler) {
6775 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006776 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006778
6779 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006780 memset(str, '?', collend - collstart);
6781 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006782 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006783 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006784 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 break;
Victor Stinner50149202015-09-22 00:26:54 +02006786
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006787 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006788 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006789 writer.min_size -= (collend - collstart);
6790 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006791 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006792 if (str == NULL)
6793 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006794 pos = collend;
6795 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006796
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006797 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006798 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006799 writer.min_size -= (collend - collstart);
6800 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006801 unicode, collstart, collend);
6802 if (str == NULL)
6803 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006804 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 break;
Victor Stinner50149202015-09-22 00:26:54 +02006806
Victor Stinnerc3713e92015-09-29 12:32:13 +02006807 case _Py_ERROR_SURROGATEESCAPE:
6808 for (i = collstart; i < collend; ++i) {
6809 ch = PyUnicode_READ(kind, data, i);
6810 if (ch < 0xdc80 || 0xdcff < ch) {
6811 /* Not a UTF-8b surrogate */
6812 break;
6813 }
6814 *str++ = (char)(ch - 0xdc00);
6815 ++pos;
6816 }
6817 if (i >= collend)
6818 break;
6819 collstart = pos;
6820 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006821 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006822
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006824 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6825 encoding, reason, unicode, &exc,
6826 collstart, collend, &newpos);
6827 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006829
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006830 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006831 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006832
Victor Stinner6bd525b2015-10-09 13:10:05 +02006833 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006834 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006835 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006836 PyBytes_AS_STRING(rep),
6837 PyBytes_GET_SIZE(rep));
Victor Stinnerad771582015-10-09 12:38:53 +02006838 if (str == NULL)
6839 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006840 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006841 else {
6842 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006843
Victor Stinner6bd525b2015-10-09 13:10:05 +02006844 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006846
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006847 if (limit == 256 ?
6848 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6849 !PyUnicode_IS_ASCII(rep))
6850 {
6851 /* Not all characters are smaller than limit */
6852 raise_encode_exception(&exc, encoding, unicode,
6853 collstart, collend, reason);
6854 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006856 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6857 str = _PyBytesWriter_WriteBytes(&writer, str,
6858 PyUnicode_DATA(rep),
6859 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006861 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006862 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006863 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006864
6865 /* If overallocation was disabled, ensure that it was the last
6866 write. Otherwise, we missed an optimization */
6867 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006868 }
6869 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006870
Victor Stinner50149202015-09-22 00:26:54 +02006871 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006873 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006874
6875 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006876 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006877 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006878 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006879 Py_XDECREF(exc);
6880 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006881}
6882
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006883/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006884PyObject *
6885PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006886 Py_ssize_t size,
6887 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006889 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006890 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006891 if (unicode == NULL)
6892 return NULL;
6893 result = unicode_encode_ucs1(unicode, errors, 256);
6894 Py_DECREF(unicode);
6895 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896}
6897
Alexander Belopolsky40018472011-02-26 01:02:56 +00006898PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006899_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900{
6901 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 PyErr_BadArgument();
6903 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006905 if (PyUnicode_READY(unicode) == -1)
6906 return NULL;
6907 /* Fast path: if it is a one-byte string, construct
6908 bytes object directly. */
6909 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6910 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6911 PyUnicode_GET_LENGTH(unicode));
6912 /* Non-Latin-1 characters present. Defer to above function to
6913 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006914 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006915}
6916
6917PyObject*
6918PyUnicode_AsLatin1String(PyObject *unicode)
6919{
6920 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921}
6922
6923/* --- 7-bit ASCII Codec -------------------------------------------------- */
6924
Alexander Belopolsky40018472011-02-26 01:02:56 +00006925PyObject *
6926PyUnicode_DecodeASCII(const char *s,
6927 Py_ssize_t size,
6928 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006931 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006932 int kind;
6933 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006934 Py_ssize_t startinpos;
6935 Py_ssize_t endinpos;
6936 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006938 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006939 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006940 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006941
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006943 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006946 if (size == 1 && (unsigned char)s[0] < 128)
6947 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006948
Victor Stinner8f674cc2013-04-17 23:02:17 +02006949 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006950 writer.min_length = size;
6951 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006952 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006953
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006954 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006955 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006956 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006957 writer.pos = outpos;
6958 if (writer.pos == size)
6959 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006960
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006961 s += writer.pos;
6962 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006963 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006964 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006966 PyUnicode_WRITE(kind, data, writer.pos, c);
6967 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006969 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02006971
6972 /* byte outsize range 0x00..0x7f: call the error handler */
6973
6974 if (error_handler == _Py_ERROR_UNKNOWN)
6975 error_handler = get_error_handler(errors);
6976
6977 switch (error_handler)
6978 {
6979 case _Py_ERROR_REPLACE:
6980 case _Py_ERROR_SURROGATEESCAPE:
6981 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02006982 but we may switch to UCS2 at the first write */
6983 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6984 goto onError;
6985 kind = writer.kind;
6986 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006987
6988 if (error_handler == _Py_ERROR_REPLACE)
6989 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6990 else
6991 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6992 writer.pos++;
6993 ++s;
6994 break;
6995
6996 case _Py_ERROR_IGNORE:
6997 ++s;
6998 break;
6999
7000 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007001 startinpos = s-starts;
7002 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007003 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007004 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 "ascii", "ordinal not in range(128)",
7006 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007007 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007009 kind = writer.kind;
7010 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007013 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007014 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007015 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007016
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007018 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007019 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007020 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 return NULL;
7022}
7023
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007024/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007025PyObject *
7026PyUnicode_EncodeASCII(const Py_UNICODE *p,
7027 Py_ssize_t size,
7028 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007030 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007031 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007032 if (unicode == NULL)
7033 return NULL;
7034 result = unicode_encode_ucs1(unicode, errors, 128);
7035 Py_DECREF(unicode);
7036 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037}
7038
Alexander Belopolsky40018472011-02-26 01:02:56 +00007039PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007040_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041{
7042 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 PyErr_BadArgument();
7044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007046 if (PyUnicode_READY(unicode) == -1)
7047 return NULL;
7048 /* Fast path: if it is an ASCII-only string, construct bytes object
7049 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007050 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007051 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7052 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007053 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007054}
7055
7056PyObject *
7057PyUnicode_AsASCIIString(PyObject *unicode)
7058{
7059 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060}
7061
Steve Dowercc16be82016-09-08 10:35:16 -07007062#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007063
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007064/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007065
/* When int is narrower than size_t an input can be longer than INT_MAX;
   decode_code_page_stateful() then feeds the decoder INT_MAX-sized chunks
   (see its NEED_RETRY branch). */
#if SIZEOF_INT < SIZEOF_SIZE_T
#  define NEED_RETRY
#endif

/* Provide the flag ourselves when the included headers do not
   (presumably older Windows SDKs -- confirm). */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7073
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007074static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007075code_page_name(UINT code_page, PyObject **obj)
7076{
7077 *obj = NULL;
7078 if (code_page == CP_ACP)
7079 return "mbcs";
7080 if (code_page == CP_UTF7)
7081 return "CP_UTF7";
7082 if (code_page == CP_UTF8)
7083 return "CP_UTF8";
7084
7085 *obj = PyBytes_FromFormat("cp%u", code_page);
7086 if (*obj == NULL)
7087 return NULL;
7088 return PyBytes_AS_STRING(*obj);
7089}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007090
Victor Stinner3a50e702011-10-18 21:21:00 +02007091static DWORD
7092decode_code_page_flags(UINT code_page)
7093{
7094 if (code_page == CP_UTF7) {
7095 /* The CP_UTF7 decoder only supports flags=0 */
7096 return 0;
7097 }
7098 else
7099 return MB_ERR_INVALID_CHARS;
7100}
7101
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007102/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007103 * Decode a byte string from a Windows code page into unicode object in strict
7104 * mode.
7105 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007106 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7107 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007108 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007109static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007110decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007111 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007112 const char *in,
7113 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007114{
Victor Stinner3a50e702011-10-18 21:21:00 +02007115 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007116 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118
7119 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 assert(insize > 0);
7121 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7122 if (outsize <= 0)
7123 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007124
7125 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007127 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007128 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 if (*v == NULL)
7130 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007131 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007132 }
7133 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007136 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007139 }
7140
7141 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007142 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7143 if (outsize <= 0)
7144 goto error;
7145 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007146
Victor Stinner3a50e702011-10-18 21:21:00 +02007147error:
7148 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7149 return -2;
7150 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007151 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007152}
7153
Victor Stinner3a50e702011-10-18 21:21:00 +02007154/*
7155 * Decode a byte string from a code page into unicode object with an error
7156 * handler.
7157 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007158 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 * UnicodeDecodeError exception and returns -1 on error.
7160 */
7161static int
7162decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007163 PyObject **v,
7164 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007165 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007166{
7167 const char *startin = in;
7168 const char *endin = in + size;
7169 const DWORD flags = decode_code_page_flags(code_page);
7170 /* Ideally, we should get reason from FormatMessage. This is the Windows
7171 2000 English version of the message. */
7172 const char *reason = "No mapping for the Unicode character exists "
7173 "in the target code page.";
7174 /* each step cannot decode more than 1 character, but a character can be
7175 represented as a surrogate pair */
7176 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007177 int insize;
7178 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 PyObject *errorHandler = NULL;
7180 PyObject *exc = NULL;
7181 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007182 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 DWORD err;
7184 int ret = -1;
7185
7186 assert(size > 0);
7187
7188 encoding = code_page_name(code_page, &encoding_obj);
7189 if (encoding == NULL)
7190 return -1;
7191
Victor Stinner7d00cc12014-03-17 23:08:06 +01007192 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7194 UnicodeDecodeError. */
7195 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7196 if (exc != NULL) {
7197 PyCodec_StrictErrors(exc);
7198 Py_CLEAR(exc);
7199 }
7200 goto error;
7201 }
7202
7203 if (*v == NULL) {
7204 /* Create unicode object */
7205 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7206 PyErr_NoMemory();
7207 goto error;
7208 }
Victor Stinnerab595942011-12-17 04:59:06 +01007209 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007210 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 if (*v == NULL)
7212 goto error;
7213 startout = PyUnicode_AS_UNICODE(*v);
7214 }
7215 else {
7216 /* Extend unicode object */
7217 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7218 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7219 PyErr_NoMemory();
7220 goto error;
7221 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007222 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007223 goto error;
7224 startout = PyUnicode_AS_UNICODE(*v) + n;
7225 }
7226
7227 /* Decode the byte string character per character */
7228 out = startout;
7229 while (in < endin)
7230 {
7231 /* Decode a character */
7232 insize = 1;
7233 do
7234 {
7235 outsize = MultiByteToWideChar(code_page, flags,
7236 in, insize,
7237 buffer, Py_ARRAY_LENGTH(buffer));
7238 if (outsize > 0)
7239 break;
7240 err = GetLastError();
7241 if (err != ERROR_NO_UNICODE_TRANSLATION
7242 && err != ERROR_INSUFFICIENT_BUFFER)
7243 {
7244 PyErr_SetFromWindowsErr(0);
7245 goto error;
7246 }
7247 insize++;
7248 }
7249 /* 4=maximum length of a UTF-8 sequence */
7250 while (insize <= 4 && (in + insize) <= endin);
7251
7252 if (outsize <= 0) {
7253 Py_ssize_t startinpos, endinpos, outpos;
7254
Victor Stinner7d00cc12014-03-17 23:08:06 +01007255 /* last character in partial decode? */
7256 if (in + insize >= endin && !final)
7257 break;
7258
Victor Stinner3a50e702011-10-18 21:21:00 +02007259 startinpos = in - startin;
7260 endinpos = startinpos + 1;
7261 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007262 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 errors, &errorHandler,
7264 encoding, reason,
7265 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007266 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 {
7268 goto error;
7269 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007270 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007271 }
7272 else {
7273 in += insize;
7274 memcpy(out, buffer, outsize * sizeof(wchar_t));
7275 out += outsize;
7276 }
7277 }
7278
7279 /* write a NUL character at the end */
7280 *out = 0;
7281
7282 /* Extend unicode object */
7283 outsize = out - startout;
7284 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007285 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007287 /* (in - startin) <= size and size is an int */
7288 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007289
7290error:
7291 Py_XDECREF(encoding_obj);
7292 Py_XDECREF(errorHandler);
7293 Py_XDECREF(exc);
7294 return ret;
7295}
7296
Victor Stinner3a50e702011-10-18 21:21:00 +02007297static PyObject *
7298decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007299 const char *s, Py_ssize_t size,
7300 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007301{
Victor Stinner76a31a62011-11-04 00:05:13 +01007302 PyObject *v = NULL;
7303 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007304
Victor Stinner3a50e702011-10-18 21:21:00 +02007305 if (code_page < 0) {
7306 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7307 return NULL;
7308 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007309 if (size < 0) {
7310 PyErr_BadInternalCall();
7311 return NULL;
7312 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007313
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007316
Victor Stinner76a31a62011-11-04 00:05:13 +01007317 do
7318 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007319#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007320 if (size > INT_MAX) {
7321 chunk_size = INT_MAX;
7322 final = 0;
7323 done = 0;
7324 }
7325 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007327 {
7328 chunk_size = (int)size;
7329 final = (consumed == NULL);
7330 done = 1;
7331 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332
Victor Stinner76a31a62011-11-04 00:05:13 +01007333 if (chunk_size == 0 && done) {
7334 if (v != NULL)
7335 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007336 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007337 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007338
Victor Stinner76a31a62011-11-04 00:05:13 +01007339 converted = decode_code_page_strict(code_page, &v,
7340 s, chunk_size);
7341 if (converted == -2)
7342 converted = decode_code_page_errors(code_page, &v,
7343 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007344 errors, final);
7345 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007346
7347 if (converted < 0) {
7348 Py_XDECREF(v);
7349 return NULL;
7350 }
7351
7352 if (consumed)
7353 *consumed += converted;
7354
7355 s += converted;
7356 size -= converted;
7357 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007358
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007359 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360}
7361
Alexander Belopolsky40018472011-02-26 01:02:56 +00007362PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007363PyUnicode_DecodeCodePageStateful(int code_page,
7364 const char *s,
7365 Py_ssize_t size,
7366 const char *errors,
7367 Py_ssize_t *consumed)
7368{
7369 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7370}
7371
7372PyObject *
7373PyUnicode_DecodeMBCSStateful(const char *s,
7374 Py_ssize_t size,
7375 const char *errors,
7376 Py_ssize_t *consumed)
7377{
7378 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7379}
7380
7381PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007382PyUnicode_DecodeMBCS(const char *s,
7383 Py_ssize_t size,
7384 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007385{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007386 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7387}
7388
Victor Stinner3a50e702011-10-18 21:21:00 +02007389static DWORD
7390encode_code_page_flags(UINT code_page, const char *errors)
7391{
7392 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007393 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 }
7395 else if (code_page == CP_UTF7) {
7396 /* CP_UTF7 only supports flags=0 */
7397 return 0;
7398 }
7399 else {
7400 if (errors != NULL && strcmp(errors, "replace") == 0)
7401 return 0;
7402 else
7403 return WC_NO_BEST_FIT_CHARS;
7404 }
7405}
7406
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007407/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007408 * Encode a Unicode string to a Windows code page into a byte string in strict
7409 * mode.
7410 *
7411 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007412 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007413 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007414static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007415encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007416 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007418{
Victor Stinner554f3f02010-06-16 23:33:54 +00007419 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007420 BOOL *pusedDefaultChar = &usedDefaultChar;
7421 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007422 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007423 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 const DWORD flags = encode_code_page_flags(code_page, NULL);
7425 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007426 /* Create a substring so that we can get the UTF-16 representation
7427 of just the slice under consideration. */
7428 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007429
Martin v. Löwis3d325192011-11-04 18:23:06 +01007430 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007431
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007433 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007435 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007436
Victor Stinner2fc507f2011-11-04 20:06:39 +01007437 substring = PyUnicode_Substring(unicode, offset, offset+len);
7438 if (substring == NULL)
7439 return -1;
7440 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7441 if (p == NULL) {
7442 Py_DECREF(substring);
7443 return -1;
7444 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007445 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007446
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007447 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007449 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 NULL, 0,
7451 NULL, pusedDefaultChar);
7452 if (outsize <= 0)
7453 goto error;
7454 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007455 if (pusedDefaultChar && *pusedDefaultChar) {
7456 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007458 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007459
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007463 if (*outbytes == NULL) {
7464 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007466 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007468 }
7469 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 const Py_ssize_t n = PyBytes_Size(*outbytes);
7472 if (outsize > PY_SSIZE_T_MAX - n) {
7473 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007474 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007477 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7478 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007480 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007482 }
7483
7484 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007486 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 out, outsize,
7488 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007489 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 if (outsize <= 0)
7491 goto error;
7492 if (pusedDefaultChar && *pusedDefaultChar)
7493 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007494 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007495
Victor Stinner3a50e702011-10-18 21:21:00 +02007496error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007497 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007498 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7499 return -2;
7500 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007501 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007502}
7503
Victor Stinner3a50e702011-10-18 21:21:00 +02007504/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007505 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 * error handler.
7507 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007508 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 * -1 on other error.
7510 */
7511static int
7512encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007513 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007514 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007515{
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007517 Py_ssize_t pos = unicode_offset;
7518 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007519 /* Ideally, we should get reason from FormatMessage. This is the Windows
7520 2000 English version of the message. */
7521 const char *reason = "invalid character";
7522 /* 4=maximum length of a UTF-8 sequence */
7523 char buffer[4];
7524 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7525 Py_ssize_t outsize;
7526 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 PyObject *errorHandler = NULL;
7528 PyObject *exc = NULL;
7529 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007530 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007531 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 PyObject *rep;
7533 int ret = -1;
7534
7535 assert(insize > 0);
7536
7537 encoding = code_page_name(code_page, &encoding_obj);
7538 if (encoding == NULL)
7539 return -1;
7540
7541 if (errors == NULL || strcmp(errors, "strict") == 0) {
7542 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7543 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007544 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 if (exc != NULL) {
7546 PyCodec_StrictErrors(exc);
7547 Py_DECREF(exc);
7548 }
7549 Py_XDECREF(encoding_obj);
7550 return -1;
7551 }
7552
7553 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7554 pusedDefaultChar = &usedDefaultChar;
7555 else
7556 pusedDefaultChar = NULL;
7557
7558 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7559 PyErr_NoMemory();
7560 goto error;
7561 }
7562 outsize = insize * Py_ARRAY_LENGTH(buffer);
7563
7564 if (*outbytes == NULL) {
7565 /* Create string object */
7566 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7567 if (*outbytes == NULL)
7568 goto error;
7569 out = PyBytes_AS_STRING(*outbytes);
7570 }
7571 else {
7572 /* Extend string object */
7573 Py_ssize_t n = PyBytes_Size(*outbytes);
7574 if (n > PY_SSIZE_T_MAX - outsize) {
7575 PyErr_NoMemory();
7576 goto error;
7577 }
7578 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7579 goto error;
7580 out = PyBytes_AS_STRING(*outbytes) + n;
7581 }
7582
7583 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007584 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007585 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007586 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7587 wchar_t chars[2];
7588 int charsize;
7589 if (ch < 0x10000) {
7590 chars[0] = (wchar_t)ch;
7591 charsize = 1;
7592 }
7593 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007594 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7595 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007596 charsize = 2;
7597 }
7598
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007600 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007601 buffer, Py_ARRAY_LENGTH(buffer),
7602 NULL, pusedDefaultChar);
7603 if (outsize > 0) {
7604 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7605 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007606 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 memcpy(out, buffer, outsize);
7608 out += outsize;
7609 continue;
7610 }
7611 }
7612 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7613 PyErr_SetFromWindowsErr(0);
7614 goto error;
7615 }
7616
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 rep = unicode_encode_call_errorhandler(
7618 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007619 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007620 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 if (rep == NULL)
7622 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007623 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007624
7625 if (PyBytes_Check(rep)) {
7626 outsize = PyBytes_GET_SIZE(rep);
7627 if (outsize != 1) {
7628 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7629 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7630 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7631 Py_DECREF(rep);
7632 goto error;
7633 }
7634 out = PyBytes_AS_STRING(*outbytes) + offset;
7635 }
7636 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7637 out += outsize;
7638 }
7639 else {
7640 Py_ssize_t i;
7641 enum PyUnicode_Kind kind;
7642 void *data;
7643
Benjamin Petersonbac79492012-01-14 13:34:47 -05007644 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007645 Py_DECREF(rep);
7646 goto error;
7647 }
7648
7649 outsize = PyUnicode_GET_LENGTH(rep);
7650 if (outsize != 1) {
7651 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7652 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7653 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7654 Py_DECREF(rep);
7655 goto error;
7656 }
7657 out = PyBytes_AS_STRING(*outbytes) + offset;
7658 }
7659 kind = PyUnicode_KIND(rep);
7660 data = PyUnicode_DATA(rep);
7661 for (i=0; i < outsize; i++) {
7662 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7663 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007664 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007665 encoding, unicode,
7666 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 "unable to encode error handler result to ASCII");
7668 Py_DECREF(rep);
7669 goto error;
7670 }
7671 *out = (unsigned char)ch;
7672 out++;
7673 }
7674 }
7675 Py_DECREF(rep);
7676 }
7677 /* write a NUL byte */
7678 *out = 0;
7679 outsize = out - PyBytes_AS_STRING(*outbytes);
7680 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7681 if (_PyBytes_Resize(outbytes, outsize) < 0)
7682 goto error;
7683 ret = 0;
7684
7685error:
7686 Py_XDECREF(encoding_obj);
7687 Py_XDECREF(errorHandler);
7688 Py_XDECREF(exc);
7689 return ret;
7690}
7691
Victor Stinner3a50e702011-10-18 21:21:00 +02007692static PyObject *
7693encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007694 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007695 const char *errors)
7696{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007697 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007698 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007699 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007700 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007701
Victor Stinner29dacf22015-01-26 16:41:32 +01007702 if (!PyUnicode_Check(unicode)) {
7703 PyErr_BadArgument();
7704 return NULL;
7705 }
7706
Benjamin Petersonbac79492012-01-14 13:34:47 -05007707 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007708 return NULL;
7709 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007710
Victor Stinner3a50e702011-10-18 21:21:00 +02007711 if (code_page < 0) {
7712 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7713 return NULL;
7714 }
7715
Martin v. Löwis3d325192011-11-04 18:23:06 +01007716 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007717 return PyBytes_FromStringAndSize(NULL, 0);
7718
Victor Stinner7581cef2011-11-03 22:32:33 +01007719 offset = 0;
7720 do
7721 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007722#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007723 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007724 chunks. */
7725 if (len > INT_MAX/2) {
7726 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007727 done = 0;
7728 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007729 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007730#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007731 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007732 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007733 done = 1;
7734 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007735
Victor Stinner76a31a62011-11-04 00:05:13 +01007736 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007737 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007738 errors);
7739 if (ret == -2)
7740 ret = encode_code_page_errors(code_page, &outbytes,
7741 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007742 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007743 if (ret < 0) {
7744 Py_XDECREF(outbytes);
7745 return NULL;
7746 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007747
Victor Stinner7581cef2011-11-03 22:32:33 +01007748 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007749 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007750 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007751
Victor Stinner3a50e702011-10-18 21:21:00 +02007752 return outbytes;
7753}
7754
7755PyObject *
7756PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7757 Py_ssize_t size,
7758 const char *errors)
7759{
Victor Stinner7581cef2011-11-03 22:32:33 +01007760 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007761 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007762 if (unicode == NULL)
7763 return NULL;
7764 res = encode_code_page(CP_ACP, unicode, errors);
7765 Py_DECREF(unicode);
7766 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007767}
7768
7769PyObject *
7770PyUnicode_EncodeCodePage(int code_page,
7771 PyObject *unicode,
7772 const char *errors)
7773{
Victor Stinner7581cef2011-11-03 22:32:33 +01007774 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007775}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007776
Alexander Belopolsky40018472011-02-26 01:02:56 +00007777PyObject *
7778PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007779{
Victor Stinner7581cef2011-11-03 22:32:33 +01007780 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007781}
7782
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007783#undef NEED_RETRY
7784
Steve Dowercc16be82016-09-08 10:35:16 -07007785#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007786
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787/* --- Character Mapping Codec -------------------------------------------- */
7788
Victor Stinnerfb161b12013-04-18 01:44:27 +02007789static int
7790charmap_decode_string(const char *s,
7791 Py_ssize_t size,
7792 PyObject *mapping,
7793 const char *errors,
7794 _PyUnicodeWriter *writer)
7795{
7796 const char *starts = s;
7797 const char *e;
7798 Py_ssize_t startinpos, endinpos;
7799 PyObject *errorHandler = NULL, *exc = NULL;
7800 Py_ssize_t maplen;
7801 enum PyUnicode_Kind mapkind;
7802 void *mapdata;
7803 Py_UCS4 x;
7804 unsigned char ch;
7805
7806 if (PyUnicode_READY(mapping) == -1)
7807 return -1;
7808
7809 maplen = PyUnicode_GET_LENGTH(mapping);
7810 mapdata = PyUnicode_DATA(mapping);
7811 mapkind = PyUnicode_KIND(mapping);
7812
7813 e = s + size;
7814
7815 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7816 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7817 * is disabled in encoding aliases, latin1 is preferred because
7818 * its implementation is faster. */
7819 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7820 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7821 Py_UCS4 maxchar = writer->maxchar;
7822
7823 assert (writer->kind == PyUnicode_1BYTE_KIND);
7824 while (s < e) {
7825 ch = *s;
7826 x = mapdata_ucs1[ch];
7827 if (x > maxchar) {
7828 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7829 goto onError;
7830 maxchar = writer->maxchar;
7831 outdata = (Py_UCS1 *)writer->data;
7832 }
7833 outdata[writer->pos] = x;
7834 writer->pos++;
7835 ++s;
7836 }
7837 return 0;
7838 }
7839
7840 while (s < e) {
7841 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7842 enum PyUnicode_Kind outkind = writer->kind;
7843 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7844 if (outkind == PyUnicode_1BYTE_KIND) {
7845 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7846 Py_UCS4 maxchar = writer->maxchar;
7847 while (s < e) {
7848 ch = *s;
7849 x = mapdata_ucs2[ch];
7850 if (x > maxchar)
7851 goto Error;
7852 outdata[writer->pos] = x;
7853 writer->pos++;
7854 ++s;
7855 }
7856 break;
7857 }
7858 else if (outkind == PyUnicode_2BYTE_KIND) {
7859 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7860 while (s < e) {
7861 ch = *s;
7862 x = mapdata_ucs2[ch];
7863 if (x == 0xFFFE)
7864 goto Error;
7865 outdata[writer->pos] = x;
7866 writer->pos++;
7867 ++s;
7868 }
7869 break;
7870 }
7871 }
7872 ch = *s;
7873
7874 if (ch < maplen)
7875 x = PyUnicode_READ(mapkind, mapdata, ch);
7876 else
7877 x = 0xfffe; /* invalid value */
7878Error:
7879 if (x == 0xfffe)
7880 {
7881 /* undefined mapping */
7882 startinpos = s-starts;
7883 endinpos = startinpos+1;
7884 if (unicode_decode_call_errorhandler_writer(
7885 errors, &errorHandler,
7886 "charmap", "character maps to <undefined>",
7887 &starts, &e, &startinpos, &endinpos, &exc, &s,
7888 writer)) {
7889 goto onError;
7890 }
7891 continue;
7892 }
7893
7894 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7895 goto onError;
7896 ++s;
7897 }
7898 Py_XDECREF(errorHandler);
7899 Py_XDECREF(exc);
7900 return 0;
7901
7902onError:
7903 Py_XDECREF(errorHandler);
7904 Py_XDECREF(exc);
7905 return -1;
7906}
7907
7908static int
7909charmap_decode_mapping(const char *s,
7910 Py_ssize_t size,
7911 PyObject *mapping,
7912 const char *errors,
7913 _PyUnicodeWriter *writer)
7914{
7915 const char *starts = s;
7916 const char *e;
7917 Py_ssize_t startinpos, endinpos;
7918 PyObject *errorHandler = NULL, *exc = NULL;
7919 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007920 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007921
7922 e = s + size;
7923
7924 while (s < e) {
7925 ch = *s;
7926
7927 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7928 key = PyLong_FromLong((long)ch);
7929 if (key == NULL)
7930 goto onError;
7931
7932 item = PyObject_GetItem(mapping, key);
7933 Py_DECREF(key);
7934 if (item == NULL) {
7935 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7936 /* No mapping found means: mapping is undefined. */
7937 PyErr_Clear();
7938 goto Undefined;
7939 } else
7940 goto onError;
7941 }
7942
7943 /* Apply mapping */
7944 if (item == Py_None)
7945 goto Undefined;
7946 if (PyLong_Check(item)) {
7947 long value = PyLong_AS_LONG(item);
7948 if (value == 0xFFFE)
7949 goto Undefined;
7950 if (value < 0 || value > MAX_UNICODE) {
7951 PyErr_Format(PyExc_TypeError,
7952 "character mapping must be in range(0x%lx)",
7953 (unsigned long)MAX_UNICODE + 1);
7954 goto onError;
7955 }
7956
7957 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7958 goto onError;
7959 }
7960 else if (PyUnicode_Check(item)) {
7961 if (PyUnicode_READY(item) == -1)
7962 goto onError;
7963 if (PyUnicode_GET_LENGTH(item) == 1) {
7964 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7965 if (value == 0xFFFE)
7966 goto Undefined;
7967 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7968 goto onError;
7969 }
7970 else {
7971 writer->overallocate = 1;
7972 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7973 goto onError;
7974 }
7975 }
7976 else {
7977 /* wrong return value */
7978 PyErr_SetString(PyExc_TypeError,
7979 "character mapping must return integer, None or str");
7980 goto onError;
7981 }
7982 Py_CLEAR(item);
7983 ++s;
7984 continue;
7985
7986Undefined:
7987 /* undefined mapping */
7988 Py_CLEAR(item);
7989 startinpos = s-starts;
7990 endinpos = startinpos+1;
7991 if (unicode_decode_call_errorhandler_writer(
7992 errors, &errorHandler,
7993 "charmap", "character maps to <undefined>",
7994 &starts, &e, &startinpos, &endinpos, &exc, &s,
7995 writer)) {
7996 goto onError;
7997 }
7998 }
7999 Py_XDECREF(errorHandler);
8000 Py_XDECREF(exc);
8001 return 0;
8002
8003onError:
8004 Py_XDECREF(item);
8005 Py_XDECREF(errorHandler);
8006 Py_XDECREF(exc);
8007 return -1;
8008}
8009
Alexander Belopolsky40018472011-02-26 01:02:56 +00008010PyObject *
8011PyUnicode_DecodeCharmap(const char *s,
8012 Py_ssize_t size,
8013 PyObject *mapping,
8014 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008016 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008017
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 /* Default to Latin-1 */
8019 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008023 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008024 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008025 writer.min_length = size;
8026 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008028
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008029 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008030 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8031 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008032 }
8033 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008034 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8035 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008036 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008037 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008038
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008040 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 return NULL;
8042}
8043
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044/* Charmap encoding: the lookup table */
8045
Alexander Belopolsky40018472011-02-26 01:02:56 +00008046struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 PyObject_HEAD
8048 unsigned char level1[32];
8049 int count2, count3;
8050 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008051};
8052
8053static PyObject*
8054encoding_map_size(PyObject *obj, PyObject* args)
8055{
8056 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008059}
8060
8061static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008062 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 PyDoc_STR("Return the size (in bytes) of this object") },
8064 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065};
8066
8067static void
8068encoding_map_dealloc(PyObject* o)
8069{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008070 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008071}
8072
8073static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008074 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 "EncodingMap", /*tp_name*/
8076 sizeof(struct encoding_map), /*tp_basicsize*/
8077 0, /*tp_itemsize*/
8078 /* methods */
8079 encoding_map_dealloc, /*tp_dealloc*/
8080 0, /*tp_print*/
8081 0, /*tp_getattr*/
8082 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008083 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 0, /*tp_repr*/
8085 0, /*tp_as_number*/
8086 0, /*tp_as_sequence*/
8087 0, /*tp_as_mapping*/
8088 0, /*tp_hash*/
8089 0, /*tp_call*/
8090 0, /*tp_str*/
8091 0, /*tp_getattro*/
8092 0, /*tp_setattro*/
8093 0, /*tp_as_buffer*/
8094 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8095 0, /*tp_doc*/
8096 0, /*tp_traverse*/
8097 0, /*tp_clear*/
8098 0, /*tp_richcompare*/
8099 0, /*tp_weaklistoffset*/
8100 0, /*tp_iter*/
8101 0, /*tp_iternext*/
8102 encoding_map_methods, /*tp_methods*/
8103 0, /*tp_members*/
8104 0, /*tp_getset*/
8105 0, /*tp_base*/
8106 0, /*tp_dict*/
8107 0, /*tp_descr_get*/
8108 0, /*tp_descr_set*/
8109 0, /*tp_dictoffset*/
8110 0, /*tp_init*/
8111 0, /*tp_alloc*/
8112 0, /*tp_new*/
8113 0, /*tp_free*/
8114 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008115};
8116
8117PyObject*
8118PyUnicode_BuildEncodingMap(PyObject* string)
8119{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 PyObject *result;
8121 struct encoding_map *mresult;
8122 int i;
8123 int need_dict = 0;
8124 unsigned char level1[32];
8125 unsigned char level2[512];
8126 unsigned char *mlevel1, *mlevel2, *mlevel3;
8127 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008128 int kind;
8129 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008130 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008132
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008133 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008134 PyErr_BadArgument();
8135 return NULL;
8136 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 kind = PyUnicode_KIND(string);
8138 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008139 length = PyUnicode_GET_LENGTH(string);
8140 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 memset(level1, 0xFF, sizeof level1);
8142 memset(level2, 0xFF, sizeof level2);
8143
8144 /* If there isn't a one-to-one mapping of NULL to \0,
8145 or if there are non-BMP characters, we need to use
8146 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008147 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008148 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008149 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008151 ch = PyUnicode_READ(kind, data, i);
8152 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008153 need_dict = 1;
8154 break;
8155 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008156 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 /* unmapped character */
8158 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159 l1 = ch >> 11;
8160 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161 if (level1[l1] == 0xFF)
8162 level1[l1] = count2++;
8163 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008164 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008165 }
8166
8167 if (count2 >= 0xFF || count3 >= 0xFF)
8168 need_dict = 1;
8169
8170 if (need_dict) {
8171 PyObject *result = PyDict_New();
8172 PyObject *key, *value;
8173 if (!result)
8174 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008175 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008176 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008177 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008178 if (!key || !value)
8179 goto failed1;
8180 if (PyDict_SetItem(result, key, value) == -1)
8181 goto failed1;
8182 Py_DECREF(key);
8183 Py_DECREF(value);
8184 }
8185 return result;
8186 failed1:
8187 Py_XDECREF(key);
8188 Py_XDECREF(value);
8189 Py_DECREF(result);
8190 return NULL;
8191 }
8192
8193 /* Create a three-level trie */
8194 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8195 16*count2 + 128*count3 - 1);
8196 if (!result)
8197 return PyErr_NoMemory();
8198 PyObject_Init(result, &EncodingMapType);
8199 mresult = (struct encoding_map*)result;
8200 mresult->count2 = count2;
8201 mresult->count3 = count3;
8202 mlevel1 = mresult->level1;
8203 mlevel2 = mresult->level23;
8204 mlevel3 = mresult->level23 + 16*count2;
8205 memcpy(mlevel1, level1, 32);
8206 memset(mlevel2, 0xFF, 16*count2);
8207 memset(mlevel3, 0, 128*count3);
8208 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008209 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008210 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008211 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8212 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008213 /* unmapped character */
8214 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008215 o1 = ch>>11;
8216 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217 i2 = 16*mlevel1[o1] + o2;
8218 if (mlevel2[i2] == 0xFF)
8219 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008220 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 i3 = 128*mlevel2[i2] + o3;
8222 mlevel3[i3] = i;
8223 }
8224 return result;
8225}
8226
8227static int
Victor Stinner22168992011-11-20 17:09:18 +01008228encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008229{
8230 struct encoding_map *map = (struct encoding_map*)mapping;
8231 int l1 = c>>11;
8232 int l2 = (c>>7) & 0xF;
8233 int l3 = c & 0x7F;
8234 int i;
8235
Victor Stinner22168992011-11-20 17:09:18 +01008236 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008238 if (c == 0)
8239 return 0;
8240 /* level 1*/
8241 i = map->level1[l1];
8242 if (i == 0xFF) {
8243 return -1;
8244 }
8245 /* level 2*/
8246 i = map->level23[16*i+l2];
8247 if (i == 0xFF) {
8248 return -1;
8249 }
8250 /* level 3 */
8251 i = map->level23[16*map->count2 + 128*i + l3];
8252 if (i == 0) {
8253 return -1;
8254 }
8255 return i;
8256}
8257
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258/* Lookup the character ch in the mapping. If the character
8259 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008260 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008261static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008262charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263{
Christian Heimes217cfd12007-12-02 14:31:20 +00008264 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265 PyObject *x;
8266
8267 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 x = PyObject_GetItem(mapping, w);
8270 Py_DECREF(w);
8271 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8273 /* No mapping found means: mapping is undefined. */
8274 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008275 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 } else
8277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008279 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008281 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 long value = PyLong_AS_LONG(x);
8283 if (value < 0 || value > 255) {
8284 PyErr_SetString(PyExc_TypeError,
8285 "character mapping must be in range(256)");
8286 Py_DECREF(x);
8287 return NULL;
8288 }
8289 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008291 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 /* wrong return value */
8295 PyErr_Format(PyExc_TypeError,
8296 "character mapping must return integer, bytes or None, not %.400s",
8297 x->ob_type->tp_name);
8298 Py_DECREF(x);
8299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 }
8301}
8302
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008303static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008304charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008305{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008306 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8307 /* exponentially overallocate to minimize reallocations */
8308 if (requiredsize < 2*outsize)
8309 requiredsize = 2*outsize;
8310 if (_PyBytes_Resize(outobj, requiredsize))
8311 return -1;
8312 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008313}
8314
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008318/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008319 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320 space is available. Return a new reference to the object that
8321 was put in the output buffer, or Py_None, if the mapping was undefined
8322 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008323 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008324static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008325charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008326 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008328 PyObject *rep;
8329 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008330 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331
Christian Heimes90aa7642007-12-19 02:45:37 +00008332 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335 if (res == -1)
8336 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 if (outsize<requiredsize)
8338 if (charmapencode_resize(outobj, outpos, requiredsize))
8339 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008340 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 outstart[(*outpos)++] = (char)res;
8342 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008343 }
8344
8345 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008346 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008348 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 Py_DECREF(rep);
8350 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008351 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 if (PyLong_Check(rep)) {
8353 Py_ssize_t requiredsize = *outpos+1;
8354 if (outsize<requiredsize)
8355 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8356 Py_DECREF(rep);
8357 return enc_EXCEPTION;
8358 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008359 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008361 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 else {
8363 const char *repchars = PyBytes_AS_STRING(rep);
8364 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8365 Py_ssize_t requiredsize = *outpos+repsize;
8366 if (outsize<requiredsize)
8367 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8368 Py_DECREF(rep);
8369 return enc_EXCEPTION;
8370 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008371 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 memcpy(outstart + *outpos, repchars, repsize);
8373 *outpos += repsize;
8374 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008376 Py_DECREF(rep);
8377 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378}
8379
8380/* handle an error in PyUnicode_EncodeCharmap
8381 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008382static int
8383charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008384 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008386 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008387 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388{
8389 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008390 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008391 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008392 enum PyUnicode_Kind kind;
8393 void *data;
8394 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008396 Py_ssize_t collstartpos = *inpos;
8397 Py_ssize_t collendpos = *inpos+1;
8398 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008399 const char *encoding = "charmap";
8400 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008401 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008402 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008403 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404
Benjamin Petersonbac79492012-01-14 13:34:47 -05008405 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008406 return -1;
8407 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 /* find all unencodable characters */
8409 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008410 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008411 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008412 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008413 val = encoding_map_lookup(ch, mapping);
8414 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 break;
8416 ++collendpos;
8417 continue;
8418 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008419
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008420 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8421 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 if (rep==NULL)
8423 return -1;
8424 else if (rep!=Py_None) {
8425 Py_DECREF(rep);
8426 break;
8427 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008428 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 }
8431 /* cache callback name lookup
8432 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008433 if (*error_handler == _Py_ERROR_UNKNOWN)
8434 *error_handler = get_error_handler(errors);
8435
8436 switch (*error_handler) {
8437 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008438 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008439 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008440
8441 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008442 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 x = charmapencode_output('?', mapping, res, respos);
8444 if (x==enc_EXCEPTION) {
8445 return -1;
8446 }
8447 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008448 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 return -1;
8450 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008451 }
8452 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008453 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454 *inpos = collendpos;
8455 break;
Victor Stinner50149202015-09-22 00:26:54 +02008456
8457 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 /* generate replacement (temporarily (mis)uses p) */
8459 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 char buffer[2+29+1+1];
8461 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008462 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 for (cp = buffer; *cp; ++cp) {
8464 x = charmapencode_output(*cp, mapping, res, respos);
8465 if (x==enc_EXCEPTION)
8466 return -1;
8467 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008468 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 return -1;
8470 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008471 }
8472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008473 *inpos = collendpos;
8474 break;
Victor Stinner50149202015-09-22 00:26:54 +02008475
Benjamin Peterson14339b62009-01-31 16:36:08 +00008476 default:
Victor Stinner50149202015-09-22 00:26:54 +02008477 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008478 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008480 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008482 if (PyBytes_Check(repunicode)) {
8483 /* Directly copy bytes result to output. */
8484 Py_ssize_t outsize = PyBytes_Size(*res);
8485 Py_ssize_t requiredsize;
8486 repsize = PyBytes_Size(repunicode);
8487 requiredsize = *respos + repsize;
8488 if (requiredsize > outsize)
8489 /* Make room for all additional bytes. */
8490 if (charmapencode_resize(res, respos, requiredsize)) {
8491 Py_DECREF(repunicode);
8492 return -1;
8493 }
8494 memcpy(PyBytes_AsString(*res) + *respos,
8495 PyBytes_AsString(repunicode), repsize);
8496 *respos += repsize;
8497 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008498 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008499 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008500 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008501 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008502 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008503 Py_DECREF(repunicode);
8504 return -1;
8505 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008506 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008507 data = PyUnicode_DATA(repunicode);
8508 kind = PyUnicode_KIND(repunicode);
8509 for (index = 0; index < repsize; index++) {
8510 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8511 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008513 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 return -1;
8515 }
8516 else if (x==enc_FAILED) {
8517 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008518 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 return -1;
8520 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008521 }
8522 *inpos = newpos;
8523 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 }
8525 return 0;
8526}
8527
Alexander Belopolsky40018472011-02-26 01:02:56 +00008528PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008529_PyUnicode_EncodeCharmap(PyObject *unicode,
8530 PyObject *mapping,
8531 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533 /* output object */
8534 PyObject *res = NULL;
8535 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008536 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008537 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008539 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008540 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008542 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008543 void *data;
8544 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545
Benjamin Petersonbac79492012-01-14 13:34:47 -05008546 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008547 return NULL;
8548 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008549 data = PyUnicode_DATA(unicode);
8550 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008551
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552 /* Default to Latin-1 */
8553 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008554 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556 /* allocate enough for a simple encoding without
8557 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008558 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559 if (res == NULL)
8560 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008561 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008565 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008567 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 if (x==enc_EXCEPTION) /* error */
8569 goto onError;
8570 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008571 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008573 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 &res, &respos)) {
8575 goto onError;
8576 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008577 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 else
8579 /* done with this character => adjust input position */
8580 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008584 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008585 if (_PyBytes_Resize(&res, respos) < 0)
8586 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008587
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008589 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008590 return res;
8591
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008593 Py_XDECREF(res);
8594 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008595 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596 return NULL;
8597}
8598
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008599/* Deprecated */
8600PyObject *
8601PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8602 Py_ssize_t size,
8603 PyObject *mapping,
8604 const char *errors)
8605{
8606 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008607 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008608 if (unicode == NULL)
8609 return NULL;
8610 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8611 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008612 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008613}
8614
Alexander Belopolsky40018472011-02-26 01:02:56 +00008615PyObject *
8616PyUnicode_AsCharmapString(PyObject *unicode,
8617 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618{
8619 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 PyErr_BadArgument();
8621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008623 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624}
8625
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008627static void
8628make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008629 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008630 Py_ssize_t startpos, Py_ssize_t endpos,
8631 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 *exceptionObject = _PyUnicodeTranslateError_Create(
8635 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636 }
8637 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8639 goto onError;
8640 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8641 goto onError;
8642 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8643 goto onError;
8644 return;
8645 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008646 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647 }
8648}
8649
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008650/* error handling callback helper:
8651 build arguments, call the callback and check the arguments,
8652 put the result into newpos and return the replacement string, which
8653 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008654static PyObject *
8655unicode_translate_call_errorhandler(const char *errors,
8656 PyObject **errorHandler,
8657 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008659 Py_ssize_t startpos, Py_ssize_t endpos,
8660 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008662 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008664 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 PyObject *restuple;
8666 PyObject *resunicode;
8667
8668 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 }
8673
8674 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008678
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008679 restuple = PyObject_CallFunctionObjArgs(
8680 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008684 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 Py_DECREF(restuple);
8686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008688 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 &resunicode, &i_newpos)) {
8690 Py_DECREF(restuple);
8691 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008693 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008694 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008695 else
8696 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008698 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 Py_DECREF(restuple);
8700 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008701 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 Py_INCREF(resunicode);
8703 Py_DECREF(restuple);
8704 return resunicode;
8705}
8706
8707/* Lookup the character ch in the mapping and put the result in result,
8708 which must be decrefed by the caller.
8709 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008710static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008711charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712{
Christian Heimes217cfd12007-12-02 14:31:20 +00008713 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 PyObject *x;
8715
8716 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 x = PyObject_GetItem(mapping, w);
8719 Py_DECREF(w);
8720 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8722 /* No mapping found means: use 1:1 mapping. */
8723 PyErr_Clear();
8724 *result = NULL;
8725 return 0;
8726 } else
8727 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008728 }
8729 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 *result = x;
8731 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008733 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008735 if (value < 0 || value > MAX_UNICODE) {
8736 PyErr_Format(PyExc_ValueError,
8737 "character mapping must be in range(0x%x)",
8738 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 Py_DECREF(x);
8740 return -1;
8741 }
8742 *result = x;
8743 return 0;
8744 }
8745 else if (PyUnicode_Check(x)) {
8746 *result = x;
8747 return 0;
8748 }
8749 else {
8750 /* wrong return value */
8751 PyErr_SetString(PyExc_TypeError,
8752 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008753 Py_DECREF(x);
8754 return -1;
8755 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008756}
Victor Stinner1194ea02014-04-04 19:37:40 +02008757
8758/* lookup the character, write the result into the writer.
8759 Return 1 if the result was written into the writer, return 0 if the mapping
8760 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008761static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008762charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8763 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008764{
Victor Stinner1194ea02014-04-04 19:37:40 +02008765 PyObject *item;
8766
8767 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008769
8770 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008772 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008775 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008776 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008777
8778 if (item == Py_None) {
8779 Py_DECREF(item);
8780 return 0;
8781 }
8782
8783 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008784 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8785 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8786 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008787 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8788 Py_DECREF(item);
8789 return -1;
8790 }
8791 Py_DECREF(item);
8792 return 1;
8793 }
8794
8795 if (!PyUnicode_Check(item)) {
8796 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008798 }
8799
8800 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8801 Py_DECREF(item);
8802 return -1;
8803 }
8804
8805 Py_DECREF(item);
8806 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008807}
8808
Victor Stinner89a76ab2014-04-05 11:44:04 +02008809static int
8810unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8811 Py_UCS1 *translate)
8812{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008813 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008814 int ret = 0;
8815
Victor Stinner89a76ab2014-04-05 11:44:04 +02008816 if (charmaptranslate_lookup(ch, mapping, &item)) {
8817 return -1;
8818 }
8819
8820 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008821 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008822 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008823 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008824 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008825 /* not found => default to 1:1 mapping */
8826 translate[ch] = ch;
8827 return 1;
8828 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008829 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008830 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008831 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8832 used it */
8833 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008834 /* invalid character or character outside ASCII:
8835 skip the fast translate */
8836 goto exit;
8837 }
8838 translate[ch] = (Py_UCS1)replace;
8839 }
8840 else if (PyUnicode_Check(item)) {
8841 Py_UCS4 replace;
8842
8843 if (PyUnicode_READY(item) == -1) {
8844 Py_DECREF(item);
8845 return -1;
8846 }
8847 if (PyUnicode_GET_LENGTH(item) != 1)
8848 goto exit;
8849
8850 replace = PyUnicode_READ_CHAR(item, 0);
8851 if (replace > 127)
8852 goto exit;
8853 translate[ch] = (Py_UCS1)replace;
8854 }
8855 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008856 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008857 goto exit;
8858 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008859 ret = 1;
8860
Benjamin Peterson1365de72014-04-07 20:15:41 -04008861 exit:
8862 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 return ret;
8864}
8865
8866/* Fast path for ascii => ascii translation. Return 1 if the whole string
8867 was translated into writer, return 0 if the input string was partially
8868 translated into writer, raise an exception and return -1 on error. */
8869static int
8870unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008871 _PyUnicodeWriter *writer, int ignore,
8872 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008873{
Victor Stinner872b2912014-04-05 14:27:07 +02008874 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008875 Py_ssize_t len;
8876 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008877 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008878
Victor Stinner89a76ab2014-04-05 11:44:04 +02008879 len = PyUnicode_GET_LENGTH(input);
8880
Victor Stinner872b2912014-04-05 14:27:07 +02008881 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008882
8883 in = PyUnicode_1BYTE_DATA(input);
8884 end = in + len;
8885
8886 assert(PyUnicode_IS_ASCII(writer->buffer));
8887 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8888 out = PyUnicode_1BYTE_DATA(writer->buffer);
8889
Victor Stinner872b2912014-04-05 14:27:07 +02008890 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008891 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008892 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008893 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008894 int translate = unicode_fast_translate_lookup(mapping, ch,
8895 ascii_table);
8896 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008897 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008898 if (translate == 0)
8899 goto exit;
8900 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008901 }
Victor Stinner872b2912014-04-05 14:27:07 +02008902 if (ch2 == 0xfe) {
8903 if (ignore)
8904 continue;
8905 goto exit;
8906 }
8907 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008908 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008909 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008910 }
Victor Stinner872b2912014-04-05 14:27:07 +02008911 res = 1;
8912
8913exit:
8914 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008915 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008916 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008917}
8918
Victor Stinner3222da22015-10-01 22:07:32 +02008919static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920_PyUnicode_TranslateCharmap(PyObject *input,
8921 PyObject *mapping,
8922 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008925 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 Py_ssize_t size, i;
8927 int kind;
8928 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008929 _PyUnicodeWriter writer;
8930 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008931 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008932 PyObject *errorHandler = NULL;
8933 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008934 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008935 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008936
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 PyErr_BadArgument();
8939 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 if (PyUnicode_READY(input) == -1)
8943 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008944 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 kind = PyUnicode_KIND(input);
8946 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008948 if (size == 0)
8949 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008951 /* allocate enough for a simple 1:1 translation without
8952 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008953 _PyUnicodeWriter_Init(&writer);
8954 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956
Victor Stinner872b2912014-04-05 14:27:07 +02008957 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8958
Victor Stinner33798672016-03-01 21:59:58 +01008959 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008960 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008961 if (PyUnicode_IS_ASCII(input)) {
8962 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8963 if (res < 0) {
8964 _PyUnicodeWriter_Dealloc(&writer);
8965 return NULL;
8966 }
8967 if (res == 1)
8968 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008969 }
Victor Stinner33798672016-03-01 21:59:58 +01008970 else {
8971 i = 0;
8972 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008976 int translate;
8977 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8978 Py_ssize_t newpos;
8979 /* startpos for collecting untranslatable chars */
8980 Py_ssize_t collstart;
8981 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008982 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983
Victor Stinner1194ea02014-04-04 19:37:40 +02008984 ch = PyUnicode_READ(kind, data, i);
8985 translate = charmaptranslate_output(ch, mapping, &writer);
8986 if (translate < 0)
8987 goto onError;
8988
8989 if (translate != 0) {
8990 /* it worked => adjust input pointer */
8991 ++i;
8992 continue;
8993 }
8994
8995 /* untranslatable character */
8996 collstart = i;
8997 collend = i+1;
8998
8999 /* find all untranslatable characters */
9000 while (collend < size) {
9001 PyObject *x;
9002 ch = PyUnicode_READ(kind, data, collend);
9003 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009004 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009005 Py_XDECREF(x);
9006 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009008 ++collend;
9009 }
9010
9011 if (ignore) {
9012 i = collend;
9013 }
9014 else {
9015 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9016 reason, input, &exc,
9017 collstart, collend, &newpos);
9018 if (repunicode == NULL)
9019 goto onError;
9020 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009021 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009022 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009023 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009024 Py_DECREF(repunicode);
9025 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009026 }
9027 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009028 Py_XDECREF(exc);
9029 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009030 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009033 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009034 Py_XDECREF(exc);
9035 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 return NULL;
9037}
9038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039/* Deprecated. Use PyUnicode_Translate instead. */
9040PyObject *
9041PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9042 Py_ssize_t size,
9043 PyObject *mapping,
9044 const char *errors)
9045{
Christian Heimes5f520f42012-09-11 14:03:25 +02009046 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009047 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 if (!unicode)
9049 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009050 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9051 Py_DECREF(unicode);
9052 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053}
9054
Alexander Belopolsky40018472011-02-26 01:02:56 +00009055PyObject *
9056PyUnicode_Translate(PyObject *str,
9057 PyObject *mapping,
9058 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009060 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009061 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009062 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063}
Tim Petersced69f82003-09-16 20:30:58 +00009064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009065static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009066fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067{
9068 /* No need to call PyUnicode_READY(self) because this function is only
9069 called as a callback from fixup() which does it already. */
9070 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9071 const int kind = PyUnicode_KIND(self);
9072 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009073 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009074 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075 Py_ssize_t i;
9076
9077 for (i = 0; i < len; ++i) {
9078 ch = PyUnicode_READ(kind, data, i);
9079 fixed = 0;
9080 if (ch > 127) {
9081 if (Py_UNICODE_ISSPACE(ch))
9082 fixed = ' ';
9083 else {
9084 const int decimal = Py_UNICODE_TODECIMAL(ch);
9085 if (decimal >= 0)
9086 fixed = '0' + decimal;
9087 }
9088 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009089 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009090 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 PyUnicode_WRITE(kind, data, i, fixed);
9092 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009093 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07009094 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 }
9097
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009098 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099}
9100
9101PyObject *
9102_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9103{
9104 if (!PyUnicode_Check(unicode)) {
9105 PyErr_BadInternalCall();
9106 return NULL;
9107 }
9108 if (PyUnicode_READY(unicode) == -1)
9109 return NULL;
9110 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9111 /* If the string is already ASCII, just return the same string */
9112 Py_INCREF(unicode);
9113 return unicode;
9114 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009115 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116}
9117
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009118PyObject *
9119PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9120 Py_ssize_t length)
9121{
Victor Stinnerf0124502011-11-21 23:12:56 +01009122 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009123 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009124 Py_UCS4 maxchar;
9125 enum PyUnicode_Kind kind;
9126 void *data;
9127
Victor Stinner99d7ad02012-02-22 13:37:39 +01009128 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009129 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009130 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009131 if (ch > 127) {
9132 int decimal = Py_UNICODE_TODECIMAL(ch);
9133 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009134 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009135 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009136 }
9137 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009138
9139 /* Copy to a new string */
9140 decimal = PyUnicode_New(length, maxchar);
9141 if (decimal == NULL)
9142 return decimal;
9143 kind = PyUnicode_KIND(decimal);
9144 data = PyUnicode_DATA(decimal);
9145 /* Iterate over code points */
9146 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009147 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009148 if (ch > 127) {
9149 int decimal = Py_UNICODE_TODECIMAL(ch);
9150 if (decimal >= 0)
9151 ch = '0' + decimal;
9152 }
9153 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009155 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009156}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009157/* --- Decimal Encoder ---------------------------------------------------- */
9158
Alexander Belopolsky40018472011-02-26 01:02:56 +00009159int
9160PyUnicode_EncodeDecimal(Py_UNICODE *s,
9161 Py_ssize_t length,
9162 char *output,
9163 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009164{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009165 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009166 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009167 enum PyUnicode_Kind kind;
9168 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009169
9170 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009171 PyErr_BadArgument();
9172 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009173 }
9174
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009175 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009176 if (unicode == NULL)
9177 return -1;
9178
Victor Stinner42bf7752011-11-21 22:52:58 +01009179 kind = PyUnicode_KIND(unicode);
9180 data = PyUnicode_DATA(unicode);
9181
Victor Stinnerb84d7232011-11-22 01:50:07 +01009182 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009183 PyObject *exc;
9184 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009185 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009186 Py_ssize_t startpos;
9187
9188 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009189
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009191 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009192 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 decimal = Py_UNICODE_TODECIMAL(ch);
9196 if (decimal >= 0) {
9197 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009198 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 continue;
9200 }
9201 if (0 < ch && ch < 256) {
9202 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009203 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 continue;
9205 }
Victor Stinner6345be92011-11-25 20:09:01 +01009206
Victor Stinner42bf7752011-11-21 22:52:58 +01009207 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009208 exc = NULL;
9209 raise_encode_exception(&exc, "decimal", unicode,
9210 startpos, startpos+1,
9211 "invalid decimal Unicode string");
9212 Py_XDECREF(exc);
9213 Py_DECREF(unicode);
9214 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009215 }
9216 /* 0-terminate the output string */
9217 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009218 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009219 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009220}
9221
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222/* --- Helpers ------------------------------------------------------------ */
9223
/* Helper macro to fix up start/end slice values against a length `len`:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` has `len` added and is then clamped to 0;
     - a negative `start` has `len` added and is then clamped to 0.
   A `start` larger than `len` is left untouched.

   `start` and `end` must be modifiable lvalues.  All three arguments are
   evaluated more than once, so they must be free of side effects.  The
   expansion is a single statement and must be followed by ';', which makes
   the macro safe inside unbraced if/else bodies. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009240any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009242 Py_ssize_t end,
9243 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009245 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 void *buf1, *buf2;
9247 Py_ssize_t len1, len2, result;
9248
9249 kind1 = PyUnicode_KIND(s1);
9250 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009251 if (kind1 < kind2)
9252 return -1;
9253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 len1 = PyUnicode_GET_LENGTH(s1);
9255 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009256 ADJUST_INDICES(start, end, len1);
9257 if (end - start < len2)
9258 return -1;
9259
9260 buf1 = PyUnicode_DATA(s1);
9261 buf2 = PyUnicode_DATA(s2);
9262 if (len2 == 1) {
9263 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9264 result = findchar((const char *)buf1 + kind1*start,
9265 kind1, end - start, ch, direction);
9266 if (result == -1)
9267 return -1;
9268 else
9269 return start + result;
9270 }
9271
9272 if (kind2 != kind1) {
9273 buf2 = _PyUnicode_AsKind(s2, kind1);
9274 if (!buf2)
9275 return -2;
9276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277
Victor Stinner794d5672011-10-10 03:21:36 +02009278 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009279 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009280 case PyUnicode_1BYTE_KIND:
9281 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9282 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9283 else
9284 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9285 break;
9286 case PyUnicode_2BYTE_KIND:
9287 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9288 break;
9289 case PyUnicode_4BYTE_KIND:
9290 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9291 break;
9292 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009293 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009294 }
9295 }
9296 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009297 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009298 case PyUnicode_1BYTE_KIND:
9299 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9300 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9301 else
9302 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9303 break;
9304 case PyUnicode_2BYTE_KIND:
9305 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9306 break;
9307 case PyUnicode_4BYTE_KIND:
9308 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9309 break;
9310 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009311 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 }
9314
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009315 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 PyMem_Free(buf2);
9317
9318 return result;
9319}
9320
9321Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009322_PyUnicode_InsertThousandsGrouping(
9323 PyObject *unicode, Py_ssize_t index,
9324 Py_ssize_t n_buffer,
9325 void *digits, Py_ssize_t n_digits,
9326 Py_ssize_t min_width,
9327 const char *grouping, PyObject *thousands_sep,
9328 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329{
Victor Stinner41a863c2012-02-24 00:37:51 +01009330 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009331 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009332 Py_ssize_t thousands_sep_len;
9333 Py_ssize_t len;
9334
9335 if (unicode != NULL) {
9336 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009337 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009338 }
9339 else {
9340 kind = PyUnicode_1BYTE_KIND;
9341 data = NULL;
9342 }
9343 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9344 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9345 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9346 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009347 if (thousands_sep_kind < kind) {
9348 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9349 if (!thousands_sep_data)
9350 return -1;
9351 }
9352 else {
9353 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9354 if (!data)
9355 return -1;
9356 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009357 }
9358
Benjamin Petersonead6b532011-12-20 17:23:42 -06009359 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009361 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009362 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009363 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009364 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009365 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009366 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009367 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009368 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009369 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009370 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009371 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009373 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009374 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009375 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009376 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009377 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009379 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009380 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009382 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009383 break;
9384 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009385 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009387 if (unicode != NULL && thousands_sep_kind != kind) {
9388 if (thousands_sep_kind < kind)
9389 PyMem_Free(thousands_sep_data);
9390 else
9391 PyMem_Free(data);
9392 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009393 if (unicode == NULL) {
9394 *maxchar = 127;
9395 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009396 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009397 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009398 }
9399 }
9400 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401}
9402
9403
Alexander Belopolsky40018472011-02-26 01:02:56 +00009404Py_ssize_t
9405PyUnicode_Count(PyObject *str,
9406 PyObject *substr,
9407 Py_ssize_t start,
9408 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009410 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009411 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 void *buf1 = NULL, *buf2 = NULL;
9413 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009414
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009415 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009417
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009418 kind1 = PyUnicode_KIND(str);
9419 kind2 = PyUnicode_KIND(substr);
9420 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009421 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009422
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009423 len1 = PyUnicode_GET_LENGTH(str);
9424 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009426 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009427 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009428
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009429 buf1 = PyUnicode_DATA(str);
9430 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009431 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009432 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009433 if (!buf2)
9434 goto onError;
9435 }
9436
9437 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009439 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009440 result = asciilib_count(
9441 ((Py_UCS1*)buf1) + start, end - start,
9442 buf2, len2, PY_SSIZE_T_MAX
9443 );
9444 else
9445 result = ucs1lib_count(
9446 ((Py_UCS1*)buf1) + start, end - start,
9447 buf2, len2, PY_SSIZE_T_MAX
9448 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 break;
9450 case PyUnicode_2BYTE_KIND:
9451 result = ucs2lib_count(
9452 ((Py_UCS2*)buf1) + start, end - start,
9453 buf2, len2, PY_SSIZE_T_MAX
9454 );
9455 break;
9456 case PyUnicode_4BYTE_KIND:
9457 result = ucs4lib_count(
9458 ((Py_UCS4*)buf1) + start, end - start,
9459 buf2, len2, PY_SSIZE_T_MAX
9460 );
9461 break;
9462 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009463 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009465
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009466 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 PyMem_Free(buf2);
9468
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009471 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 PyMem_Free(buf2);
9473 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474}
9475
Alexander Belopolsky40018472011-02-26 01:02:56 +00009476Py_ssize_t
9477PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009478 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009479 Py_ssize_t start,
9480 Py_ssize_t end,
9481 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009483 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009485
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009486 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487}
9488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489Py_ssize_t
9490PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9491 Py_ssize_t start, Py_ssize_t end,
9492 int direction)
9493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009495 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 if (PyUnicode_READY(str) == -1)
9497 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009498 len = PyUnicode_GET_LENGTH(str);
9499 ADJUST_INDICES(start, end, len);
9500 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009501 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009503 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9504 kind, end-start, ch, direction);
9505 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009507 else
9508 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509}
9510
Alexander Belopolsky40018472011-02-26 01:02:56 +00009511static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009512tailmatch(PyObject *self,
9513 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009514 Py_ssize_t start,
9515 Py_ssize_t end,
9516 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 int kind_self;
9519 int kind_sub;
9520 void *data_self;
9521 void *data_sub;
9522 Py_ssize_t offset;
9523 Py_ssize_t i;
9524 Py_ssize_t end_sub;
9525
9526 if (PyUnicode_READY(self) == -1 ||
9527 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009528 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9531 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009535 if (PyUnicode_GET_LENGTH(substring) == 0)
9536 return 1;
9537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 kind_self = PyUnicode_KIND(self);
9539 data_self = PyUnicode_DATA(self);
9540 kind_sub = PyUnicode_KIND(substring);
9541 data_sub = PyUnicode_DATA(substring);
9542 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9543
9544 if (direction > 0)
9545 offset = end;
9546 else
9547 offset = start;
9548
9549 if (PyUnicode_READ(kind_self, data_self, offset) ==
9550 PyUnicode_READ(kind_sub, data_sub, 0) &&
9551 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9552 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9553 /* If both are of the same kind, memcmp is sufficient */
9554 if (kind_self == kind_sub) {
9555 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009556 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 data_sub,
9558 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009559 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009561 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 else {
9563 /* We do not need to compare 0 and len(substring)-1 because
9564 the if statement above ensured already that they are equal
9565 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 for (i = 1; i < end_sub; ++i) {
9567 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9568 PyUnicode_READ(kind_sub, data_sub, i))
9569 return 0;
9570 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009571 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573 }
9574
9575 return 0;
9576}
9577
Alexander Belopolsky40018472011-02-26 01:02:56 +00009578Py_ssize_t
9579PyUnicode_Tailmatch(PyObject *str,
9580 PyObject *substr,
9581 Py_ssize_t start,
9582 Py_ssize_t end,
9583 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009585 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009587
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009588 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589}
9590
Guido van Rossumd57fd912000-03-10 22:53:23 +00009591/* Apply fixfct filter to the Unicode object self and return a
9592 reference to the modified object */
9593
Alexander Belopolsky40018472011-02-26 01:02:56 +00009594static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009595fixup(PyObject *self,
9596 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 PyObject *u;
9599 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009600 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009601
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009602 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009604 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009605 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 /* fix functions return the new maximum character in a string,
9608 if the kind of the resulting unicode object does not change,
9609 everything is fine. Otherwise we need to change the string kind
9610 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009611 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009612
9613 if (maxchar_new == 0) {
9614 /* no changes */;
9615 if (PyUnicode_CheckExact(self)) {
9616 Py_DECREF(u);
9617 Py_INCREF(self);
9618 return self;
9619 }
9620 else
9621 return u;
9622 }
9623
Victor Stinnere6abb482012-05-02 01:15:40 +02009624 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625
Victor Stinnereaab6042011-12-11 22:22:39 +01009626 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009628
9629 /* In case the maximum character changed, we need to
9630 convert the string to the new category. */
9631 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9632 if (v == NULL) {
9633 Py_DECREF(u);
9634 return NULL;
9635 }
9636 if (maxchar_new > maxchar_old) {
9637 /* If the maxchar increased so that the kind changed, not all
9638 characters are representable anymore and we need to fix the
9639 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009640 _PyUnicode_FastCopyCharacters(v, 0,
9641 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009642 maxchar_old = fixfct(v);
9643 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 }
9645 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009646 _PyUnicode_FastCopyCharacters(v, 0,
9647 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009649 Py_DECREF(u);
9650 assert(_PyUnicode_CheckConsistency(v, 1));
9651 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652}
9653
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009654static PyObject *
9655ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009657 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9658 char *resdata, *data = PyUnicode_DATA(self);
9659 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009660
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009661 res = PyUnicode_New(len, 127);
9662 if (res == NULL)
9663 return NULL;
9664 resdata = PyUnicode_DATA(res);
9665 if (lower)
9666 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668 _Py_bytes_upper(resdata, data, len);
9669 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670}
9671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009673handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675 Py_ssize_t j;
9676 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009677 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009679
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9681
9682 where ! is a negation and \p{xxx} is a character with property xxx.
9683 */
9684 for (j = i - 1; j >= 0; j--) {
9685 c = PyUnicode_READ(kind, data, j);
9686 if (!_PyUnicode_IsCaseIgnorable(c))
9687 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009689 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9690 if (final_sigma) {
9691 for (j = i + 1; j < length; j++) {
9692 c = PyUnicode_READ(kind, data, j);
9693 if (!_PyUnicode_IsCaseIgnorable(c))
9694 break;
9695 }
9696 final_sigma = j == length || !_PyUnicode_IsCased(c);
9697 }
9698 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699}
9700
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701static int
9702lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9703 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009705 /* Obscure special case. */
9706 if (c == 0x3A3) {
9707 mapped[0] = handle_capital_sigma(kind, data, length, i);
9708 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009710 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711}
9712
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009713static Py_ssize_t
9714do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716 Py_ssize_t i, k = 0;
9717 int n_res, j;
9718 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009719
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720 c = PyUnicode_READ(kind, data, 0);
9721 n_res = _PyUnicode_ToUpperFull(c, mapped);
9722 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009723 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009724 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009726 for (i = 1; i < length; i++) {
9727 c = PyUnicode_READ(kind, data, i);
9728 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9729 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009730 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009732 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009733 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009734 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735}
9736
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009737static Py_ssize_t
9738do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9739 Py_ssize_t i, k = 0;
9740
9741 for (i = 0; i < length; i++) {
9742 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9743 int n_res, j;
9744 if (Py_UNICODE_ISUPPER(c)) {
9745 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9746 }
9747 else if (Py_UNICODE_ISLOWER(c)) {
9748 n_res = _PyUnicode_ToUpperFull(c, mapped);
9749 }
9750 else {
9751 n_res = 1;
9752 mapped[0] = c;
9753 }
9754 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009755 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009756 res[k++] = mapped[j];
9757 }
9758 }
9759 return k;
9760}
9761
9762static Py_ssize_t
9763do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9764 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009766 Py_ssize_t i, k = 0;
9767
9768 for (i = 0; i < length; i++) {
9769 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9770 int n_res, j;
9771 if (lower)
9772 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9773 else
9774 n_res = _PyUnicode_ToUpperFull(c, mapped);
9775 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009776 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009777 res[k++] = mapped[j];
9778 }
9779 }
9780 return k;
9781}
9782
9783static Py_ssize_t
9784do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9785{
9786 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9787}
9788
9789static Py_ssize_t
9790do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9791{
9792 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9793}
9794
Benjamin Petersone51757f2012-01-12 21:10:29 -05009795static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009796do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797{
9798 Py_ssize_t i, k = 0;
9799
9800 for (i = 0; i < length; i++) {
9801 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9802 Py_UCS4 mapped[3];
9803 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9804 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009805 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009806 res[k++] = mapped[j];
9807 }
9808 }
9809 return k;
9810}
9811
9812static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009813do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9814{
9815 Py_ssize_t i, k = 0;
9816 int previous_is_cased;
9817
9818 previous_is_cased = 0;
9819 for (i = 0; i < length; i++) {
9820 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9821 Py_UCS4 mapped[3];
9822 int n_res, j;
9823
9824 if (previous_is_cased)
9825 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9826 else
9827 n_res = _PyUnicode_ToTitleFull(c, mapped);
9828
9829 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009830 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009831 res[k++] = mapped[j];
9832 }
9833
9834 previous_is_cased = _PyUnicode_IsCased(c);
9835 }
9836 return k;
9837}
9838
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009839static PyObject *
9840case_operation(PyObject *self,
9841 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9842{
9843 PyObject *res = NULL;
9844 Py_ssize_t length, newlength = 0;
9845 int kind, outkind;
9846 void *data, *outdata;
9847 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9848
Benjamin Petersoneea48462012-01-16 14:28:50 -05009849 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009850
9851 kind = PyUnicode_KIND(self);
9852 data = PyUnicode_DATA(self);
9853 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009854 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009855 PyErr_SetString(PyExc_OverflowError, "string is too long");
9856 return NULL;
9857 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009858 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009859 if (tmp == NULL)
9860 return PyErr_NoMemory();
9861 newlength = perform(kind, data, length, tmp, &maxchar);
9862 res = PyUnicode_New(newlength, maxchar);
9863 if (res == NULL)
9864 goto leave;
9865 tmpend = tmp + newlength;
9866 outdata = PyUnicode_DATA(res);
9867 outkind = PyUnicode_KIND(res);
9868 switch (outkind) {
9869 case PyUnicode_1BYTE_KIND:
9870 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9871 break;
9872 case PyUnicode_2BYTE_KIND:
9873 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9874 break;
9875 case PyUnicode_4BYTE_KIND:
9876 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9877 break;
9878 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009879 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009880 }
9881 leave:
9882 PyMem_FREE(tmp);
9883 return res;
9884}
9885
Tim Peters8ce9f162004-08-27 01:49:32 +00009886PyObject *
9887PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009889 PyObject *res;
9890 PyObject *fseq;
9891 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009892 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009894 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009895 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009896 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009897 }
9898
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009899 /* NOTE: the following code can't call back into Python code,
9900 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009901 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009902
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009903 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009904 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009905 res = _PyUnicode_JoinArray(separator, items, seqlen);
9906 Py_DECREF(fseq);
9907 return res;
9908}
9909
9910PyObject *
9911_PyUnicode_JoinArray(PyObject *separator, PyObject **items, Py_ssize_t seqlen)
9912{
9913 PyObject *res = NULL; /* the result */
9914 PyObject *sep = NULL;
9915 Py_ssize_t seplen;
9916 PyObject *item;
9917 Py_ssize_t sz, i, res_offset;
9918 Py_UCS4 maxchar;
9919 Py_UCS4 item_maxchar;
9920 int use_memcpy;
9921 unsigned char *res_data = NULL, *sep_data = NULL;
9922 PyObject *last_obj;
9923 unsigned int kind = 0;
9924
Tim Peters05eba1f2004-08-27 21:32:02 +00009925 /* If empty sequence, return u"". */
9926 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009927 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009928 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009929
Tim Peters05eba1f2004-08-27 21:32:02 +00009930 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009931 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009932 if (seqlen == 1) {
9933 if (PyUnicode_CheckExact(items[0])) {
9934 res = items[0];
9935 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009936 return res;
9937 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009938 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009939 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009940 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009941 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009942 /* Set up sep and seplen */
9943 if (separator == NULL) {
9944 /* fall back to a blank space separator */
9945 sep = PyUnicode_FromOrdinal(' ');
9946 if (!sep)
9947 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009948 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009949 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009950 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009951 else {
9952 if (!PyUnicode_Check(separator)) {
9953 PyErr_Format(PyExc_TypeError,
9954 "separator: expected str instance,"
9955 " %.80s found",
9956 Py_TYPE(separator)->tp_name);
9957 goto onError;
9958 }
9959 if (PyUnicode_READY(separator))
9960 goto onError;
9961 sep = separator;
9962 seplen = PyUnicode_GET_LENGTH(separator);
9963 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9964 /* inc refcount to keep this code path symmetric with the
9965 above case of a blank separator */
9966 Py_INCREF(sep);
9967 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009968 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009969 }
9970
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009971 /* There are at least two things to join, or else we have a subclass
9972 * of str in the sequence.
9973 * Do a pre-pass to figure out the total amount of space we'll
9974 * need (sz), and see whether all argument are strings.
9975 */
9976 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009977#ifdef Py_DEBUG
9978 use_memcpy = 0;
9979#else
9980 use_memcpy = 1;
9981#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009982 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +08009983 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009984 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009985 if (!PyUnicode_Check(item)) {
9986 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009987 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009988 " %.80s found",
9989 i, Py_TYPE(item)->tp_name);
9990 goto onError;
9991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 if (PyUnicode_READY(item) == -1)
9993 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +08009994 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009996 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +08009997 if (i != 0) {
9998 add_sz += seplen;
9999 }
10000 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010001 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010002 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010003 goto onError;
10004 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010005 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010006 if (use_memcpy && last_obj != NULL) {
10007 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10008 use_memcpy = 0;
10009 }
10010 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010011 }
Tim Petersced69f82003-09-16 20:30:58 +000010012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010014 if (res == NULL)
10015 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010016
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010017 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010018#ifdef Py_DEBUG
10019 use_memcpy = 0;
10020#else
10021 if (use_memcpy) {
10022 res_data = PyUnicode_1BYTE_DATA(res);
10023 kind = PyUnicode_KIND(res);
10024 if (seplen != 0)
10025 sep_data = PyUnicode_1BYTE_DATA(sep);
10026 }
10027#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010028 if (use_memcpy) {
10029 for (i = 0; i < seqlen; ++i) {
10030 Py_ssize_t itemlen;
10031 item = items[i];
10032
10033 /* Copy item, and maybe the separator. */
10034 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010035 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010036 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010037 kind * seplen);
10038 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010039 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010040
10041 itemlen = PyUnicode_GET_LENGTH(item);
10042 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010043 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010044 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010045 kind * itemlen);
10046 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010047 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010048 }
10049 assert(res_data == PyUnicode_1BYTE_DATA(res)
10050 + kind * PyUnicode_GET_LENGTH(res));
10051 }
10052 else {
10053 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10054 Py_ssize_t itemlen;
10055 item = items[i];
10056
10057 /* Copy item, and maybe the separator. */
10058 if (i && seplen != 0) {
10059 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10060 res_offset += seplen;
10061 }
10062
10063 itemlen = PyUnicode_GET_LENGTH(item);
10064 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010065 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010066 res_offset += itemlen;
10067 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010068 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010069 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010070 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010073 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075
Benjamin Peterson29060642009-01-31 22:14:21 +000010076 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010078 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079 return NULL;
10080}
10081
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, starting at character
   index `start`.  `kind` selects the character width (1, 2 or 4 byte
   kinds); PyUnicode_WCHAR_KIND is rejected by an assertion.

   Every macro argument is parenthesized where it is used so that
   expression arguments (for example a conditional expression passed as
   `value`) are evaluated as a whole rather than being split by the cast
   or the comparison. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
10105
Victor Stinnerd3f08822012-05-29 12:57:52 +020010106void
10107_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10108 Py_UCS4 fill_char)
10109{
10110 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10111 const void *data = PyUnicode_DATA(unicode);
10112 assert(PyUnicode_IS_READY(unicode));
10113 assert(unicode_modifiable(unicode));
10114 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10115 assert(start >= 0);
10116 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10117 FILL(kind, data, fill_char, start, length);
10118}
10119
Victor Stinner3fe55312012-01-04 00:33:50 +010010120Py_ssize_t
10121PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10122 Py_UCS4 fill_char)
10123{
10124 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010125
10126 if (!PyUnicode_Check(unicode)) {
10127 PyErr_BadInternalCall();
10128 return -1;
10129 }
10130 if (PyUnicode_READY(unicode) == -1)
10131 return -1;
10132 if (unicode_check_modifiable(unicode))
10133 return -1;
10134
Victor Stinnerd3f08822012-05-29 12:57:52 +020010135 if (start < 0) {
10136 PyErr_SetString(PyExc_IndexError, "string index out of range");
10137 return -1;
10138 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010139 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10140 PyErr_SetString(PyExc_ValueError,
10141 "fill character is bigger than "
10142 "the string maximum character");
10143 return -1;
10144 }
10145
10146 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10147 length = Py_MIN(maxlen, length);
10148 if (length <= 0)
10149 return 0;
10150
Victor Stinnerd3f08822012-05-29 12:57:52 +020010151 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010152 return length;
10153}
10154
Victor Stinner9310abb2011-10-05 00:59:23 +020010155static PyObject *
10156pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010157 Py_ssize_t left,
10158 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 PyObject *u;
10162 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010163 int kind;
10164 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165
10166 if (left < 0)
10167 left = 0;
10168 if (right < 0)
10169 right = 0;
10170
Victor Stinnerc4b49542011-12-11 22:44:26 +010010171 if (left == 0 && right == 0)
10172 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10175 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010176 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10177 return NULL;
10178 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010180 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010182 if (!u)
10183 return NULL;
10184
10185 kind = PyUnicode_KIND(u);
10186 data = PyUnicode_DATA(u);
10187 if (left)
10188 FILL(kind, data, fill, 0, left);
10189 if (right)
10190 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010191 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010192 assert(_PyUnicode_CheckConsistency(u, 1));
10193 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194}
10195
Alexander Belopolsky40018472011-02-26 01:02:56 +000010196PyObject *
10197PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010201 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203
Benjamin Petersonead6b532011-12-20 17:23:42 -060010204 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010206 if (PyUnicode_IS_ASCII(string))
10207 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010208 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010209 PyUnicode_GET_LENGTH(string), keepends);
10210 else
10211 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010212 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010213 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 break;
10215 case PyUnicode_2BYTE_KIND:
10216 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010217 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 PyUnicode_GET_LENGTH(string), keepends);
10219 break;
10220 case PyUnicode_4BYTE_KIND:
10221 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010222 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 PyUnicode_GET_LENGTH(string), keepends);
10224 break;
10225 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010226 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229}
10230
Alexander Belopolsky40018472011-02-26 01:02:56 +000010231static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010232split(PyObject *self,
10233 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010234 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010236 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 void *buf1, *buf2;
10238 Py_ssize_t len1, len2;
10239 PyObject* out;
10240
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010242 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 if (PyUnicode_READY(self) == -1)
10245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010248 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010250 if (PyUnicode_IS_ASCII(self))
10251 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010252 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010253 PyUnicode_GET_LENGTH(self), maxcount
10254 );
10255 else
10256 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010257 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010258 PyUnicode_GET_LENGTH(self), maxcount
10259 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 case PyUnicode_2BYTE_KIND:
10261 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010262 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 PyUnicode_GET_LENGTH(self), maxcount
10264 );
10265 case PyUnicode_4BYTE_KIND:
10266 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010267 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 PyUnicode_GET_LENGTH(self), maxcount
10269 );
10270 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010271 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 }
10273
10274 if (PyUnicode_READY(substring) == -1)
10275 return NULL;
10276
10277 kind1 = PyUnicode_KIND(self);
10278 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 len1 = PyUnicode_GET_LENGTH(self);
10280 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010281 if (kind1 < kind2 || len1 < len2) {
10282 out = PyList_New(1);
10283 if (out == NULL)
10284 return NULL;
10285 Py_INCREF(self);
10286 PyList_SET_ITEM(out, 0, self);
10287 return out;
10288 }
10289 buf1 = PyUnicode_DATA(self);
10290 buf2 = PyUnicode_DATA(substring);
10291 if (kind2 != kind1) {
10292 buf2 = _PyUnicode_AsKind(substring, kind1);
10293 if (!buf2)
10294 return NULL;
10295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010297 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010299 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10300 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010301 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010302 else
10303 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010304 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 break;
10306 case PyUnicode_2BYTE_KIND:
10307 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010308 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 break;
10310 case PyUnicode_4BYTE_KIND:
10311 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010312 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 break;
10314 default:
10315 out = NULL;
10316 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010317 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 PyMem_Free(buf2);
10319 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320}
10321
Alexander Belopolsky40018472011-02-26 01:02:56 +000010322static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010323rsplit(PyObject *self,
10324 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010325 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010326{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010327 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 void *buf1, *buf2;
10329 Py_ssize_t len1, len2;
10330 PyObject* out;
10331
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010332 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010333 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 if (PyUnicode_READY(self) == -1)
10336 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010339 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010341 if (PyUnicode_IS_ASCII(self))
10342 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010343 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010344 PyUnicode_GET_LENGTH(self), maxcount
10345 );
10346 else
10347 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010348 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 PyUnicode_GET_LENGTH(self), maxcount
10350 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 case PyUnicode_2BYTE_KIND:
10352 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010353 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 PyUnicode_GET_LENGTH(self), maxcount
10355 );
10356 case PyUnicode_4BYTE_KIND:
10357 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010358 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 PyUnicode_GET_LENGTH(self), maxcount
10360 );
10361 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010362 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 }
10364
10365 if (PyUnicode_READY(substring) == -1)
10366 return NULL;
10367
10368 kind1 = PyUnicode_KIND(self);
10369 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 len1 = PyUnicode_GET_LENGTH(self);
10371 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010372 if (kind1 < kind2 || len1 < len2) {
10373 out = PyList_New(1);
10374 if (out == NULL)
10375 return NULL;
10376 Py_INCREF(self);
10377 PyList_SET_ITEM(out, 0, self);
10378 return out;
10379 }
10380 buf1 = PyUnicode_DATA(self);
10381 buf2 = PyUnicode_DATA(substring);
10382 if (kind2 != kind1) {
10383 buf2 = _PyUnicode_AsKind(substring, kind1);
10384 if (!buf2)
10385 return NULL;
10386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010388 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010390 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10391 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010392 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010393 else
10394 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010395 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 break;
10397 case PyUnicode_2BYTE_KIND:
10398 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010399 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 break;
10401 case PyUnicode_4BYTE_KIND:
10402 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010403 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 break;
10405 default:
10406 out = NULL;
10407 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010408 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 PyMem_Free(buf2);
10410 return out;
10411}
10412
10413static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010414anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10415 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010417 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010419 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10420 return asciilib_find(buf1, len1, buf2, len2, offset);
10421 else
10422 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 case PyUnicode_2BYTE_KIND:
10424 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10425 case PyUnicode_4BYTE_KIND:
10426 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10427 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010428 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429}
10430
10431static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010432anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10433 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010435 switch (kind) {
10436 case PyUnicode_1BYTE_KIND:
10437 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10438 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10439 else
10440 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10441 case PyUnicode_2BYTE_KIND:
10442 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10443 case PyUnicode_4BYTE_KIND:
10444 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10445 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010446 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010447}
10448
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010449static void
10450replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10451 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10452{
10453 int kind = PyUnicode_KIND(u);
10454 void *data = PyUnicode_DATA(u);
10455 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10456 if (kind == PyUnicode_1BYTE_KIND) {
10457 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10458 (Py_UCS1 *)data + len,
10459 u1, u2, maxcount);
10460 }
10461 else if (kind == PyUnicode_2BYTE_KIND) {
10462 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10463 (Py_UCS2 *)data + len,
10464 u1, u2, maxcount);
10465 }
10466 else {
10467 assert(kind == PyUnicode_4BYTE_KIND);
10468 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10469 (Py_UCS4 *)data + len,
10470 u1, u2, maxcount);
10471 }
10472}
10473
Alexander Belopolsky40018472011-02-26 01:02:56 +000010474static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475replace(PyObject *self, PyObject *str1,
10476 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 PyObject *u;
10479 char *sbuf = PyUnicode_DATA(self);
10480 char *buf1 = PyUnicode_DATA(str1);
10481 char *buf2 = PyUnicode_DATA(str2);
10482 int srelease = 0, release1 = 0, release2 = 0;
10483 int skind = PyUnicode_KIND(self);
10484 int kind1 = PyUnicode_KIND(str1);
10485 int kind2 = PyUnicode_KIND(str2);
10486 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10487 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10488 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010489 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010490 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491
10492 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010493 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010495 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496
Victor Stinner59de0ee2011-10-07 10:01:28 +020010497 if (str1 == str2)
10498 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499
Victor Stinner49a0a212011-10-12 23:46:10 +020010500 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010501 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10502 if (maxchar < maxchar_str1)
10503 /* substring too wide to be present */
10504 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010505 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10506 /* Replacing str1 with str2 may cause a maxchar reduction in the
10507 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010508 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010509 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010512 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010514 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010516 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010517 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010518 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010519
Victor Stinner69ed0f42013-04-09 21:48:24 +020010520 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010521 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010522 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010523 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010524 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010526 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010528
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010529 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10530 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010531 }
10532 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 int rkind = skind;
10534 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010535 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 if (kind1 < rkind) {
10538 /* widen substring */
10539 buf1 = _PyUnicode_AsKind(str1, rkind);
10540 if (!buf1) goto error;
10541 release1 = 1;
10542 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010543 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010544 if (i < 0)
10545 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 if (rkind > kind2) {
10547 /* widen replacement */
10548 buf2 = _PyUnicode_AsKind(str2, rkind);
10549 if (!buf2) goto error;
10550 release2 = 1;
10551 }
10552 else if (rkind < kind2) {
10553 /* widen self and buf1 */
10554 rkind = kind2;
10555 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010556 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 sbuf = _PyUnicode_AsKind(self, rkind);
10558 if (!sbuf) goto error;
10559 srelease = 1;
10560 buf1 = _PyUnicode_AsKind(str1, rkind);
10561 if (!buf1) goto error;
10562 release1 = 1;
10563 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010564 u = PyUnicode_New(slen, maxchar);
10565 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010567 assert(PyUnicode_KIND(u) == rkind);
10568 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010569
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010570 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010571 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010572 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010574 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010576
10577 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010578 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010579 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010580 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010581 if (i == -1)
10582 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010583 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010585 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010589 }
10590 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010592 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 int rkind = skind;
10594 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010597 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 buf1 = _PyUnicode_AsKind(str1, rkind);
10599 if (!buf1) goto error;
10600 release1 = 1;
10601 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010602 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010603 if (n == 0)
10604 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010606 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 buf2 = _PyUnicode_AsKind(str2, rkind);
10608 if (!buf2) goto error;
10609 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010612 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 rkind = kind2;
10614 sbuf = _PyUnicode_AsKind(self, rkind);
10615 if (!sbuf) goto error;
10616 srelease = 1;
10617 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010618 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 buf1 = _PyUnicode_AsKind(str1, rkind);
10620 if (!buf1) goto error;
10621 release1 = 1;
10622 }
10623 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10624 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010625 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 PyErr_SetString(PyExc_OverflowError,
10627 "replace string is too long");
10628 goto error;
10629 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010630 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010631 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010632 _Py_INCREF_UNICODE_EMPTY();
10633 if (!unicode_empty)
10634 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010635 u = unicode_empty;
10636 goto done;
10637 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010638 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 PyErr_SetString(PyExc_OverflowError,
10640 "replace string is too long");
10641 goto error;
10642 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010643 u = PyUnicode_New(new_size, maxchar);
10644 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010646 assert(PyUnicode_KIND(u) == rkind);
10647 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 ires = i = 0;
10649 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 while (n-- > 0) {
10651 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010652 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010653 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010654 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010655 if (j == -1)
10656 break;
10657 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010658 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010659 memcpy(res + rkind * ires,
10660 sbuf + rkind * i,
10661 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663 }
10664 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010668 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010674 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010675 memcpy(res + rkind * ires,
10676 sbuf + rkind * i,
10677 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010678 }
10679 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010680 /* interleave */
10681 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010682 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010684 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010686 if (--n <= 0)
10687 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010688 memcpy(res + rkind * ires,
10689 sbuf + rkind * i,
10690 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 ires++;
10692 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010693 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010694 memcpy(res + rkind * ires,
10695 sbuf + rkind * i,
10696 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010698 }
10699
10700 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010701 unicode_adjust_maxchar(&u);
10702 if (u == NULL)
10703 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010705
10706 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 if (srelease)
10708 PyMem_FREE(sbuf);
10709 if (release1)
10710 PyMem_FREE(buf1);
10711 if (release2)
10712 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010713 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010715
Benjamin Peterson29060642009-01-31 22:14:21 +000010716 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010717 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 if (srelease)
10719 PyMem_FREE(sbuf);
10720 if (release1)
10721 PyMem_FREE(buf1);
10722 if (release2)
10723 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010724 return unicode_result_unchanged(self);
10725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 error:
10727 if (srelease && sbuf)
10728 PyMem_FREE(sbuf);
10729 if (release1 && buf1)
10730 PyMem_FREE(buf1);
10731 if (release2 && buf2)
10732 PyMem_FREE(buf2);
10733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734}
10735
10736/* --- Unicode Object Methods --------------------------------------------- */
10737
INADA Naoki3ae20562017-01-16 20:41:20 +090010738/*[clinic input]
10739str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740
INADA Naoki3ae20562017-01-16 20:41:20 +090010741Return a version of the string where each word is titlecased.
10742
10743More specifically, words start with uppercased characters and all remaining
10744cased characters have lower case.
10745[clinic start generated code]*/
10746
10747static PyObject *
10748unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010749/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010751 if (PyUnicode_READY(self) == -1)
10752 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010753 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754}
10755
INADA Naoki3ae20562017-01-16 20:41:20 +090010756/*[clinic input]
10757str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758
INADA Naoki3ae20562017-01-16 20:41:20 +090010759Return a capitalized version of the string.
10760
10761More specifically, make the first character have upper case and the rest lower
10762case.
10763[clinic start generated code]*/
10764
10765static PyObject *
10766unicode_capitalize_impl(PyObject *self)
10767/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010769 if (PyUnicode_READY(self) == -1)
10770 return NULL;
10771 if (PyUnicode_GET_LENGTH(self) == 0)
10772 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010773 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774}
10775
INADA Naoki3ae20562017-01-16 20:41:20 +090010776/*[clinic input]
10777str.casefold as unicode_casefold
10778
10779Return a version of the string suitable for caseless comparisons.
10780[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010781
10782static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010783unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010784/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010785{
10786 if (PyUnicode_READY(self) == -1)
10787 return NULL;
10788 if (PyUnicode_IS_ASCII(self))
10789 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010790 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010791}
10792
10793
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010794/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010795
10796static int
10797convert_uc(PyObject *obj, void *addr)
10798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010800
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010801 if (!PyUnicode_Check(obj)) {
10802 PyErr_Format(PyExc_TypeError,
10803 "The fill character must be a unicode character, "
10804 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010805 return 0;
10806 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010807 if (PyUnicode_READY(obj) < 0)
10808 return 0;
10809 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010810 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010811 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010812 return 0;
10813 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010814 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010815 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010816}
10817
INADA Naoki3ae20562017-01-16 20:41:20 +090010818/*[clinic input]
10819str.center as unicode_center
10820
10821 width: Py_ssize_t
10822 fillchar: Py_UCS4 = ' '
10823 /
10824
10825Return a centered string of length width.
10826
10827Padding is done using the specified fill character (default is a space).
10828[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829
10830static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010831unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10832/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010834 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835
Benjamin Petersonbac79492012-01-14 13:34:47 -050010836 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837 return NULL;
10838
Victor Stinnerc4b49542011-12-11 22:44:26 +010010839 if (PyUnicode_GET_LENGTH(self) >= width)
10840 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
Victor Stinnerc4b49542011-12-11 22:44:26 +010010842 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843 left = marg / 2 + (marg & width & 1);
10844
Victor Stinner9310abb2011-10-05 00:59:23 +020010845 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846}
10847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848/* This function assumes that str1 and str2 are readied by the caller. */
10849
Marc-André Lemburge5034372000-08-08 08:04:29 +000010850static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010851unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010852{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010853#define COMPARE(TYPE1, TYPE2) \
10854 do { \
10855 TYPE1* p1 = (TYPE1 *)data1; \
10856 TYPE2* p2 = (TYPE2 *)data2; \
10857 TYPE1* end = p1 + len; \
10858 Py_UCS4 c1, c2; \
10859 for (; p1 != end; p1++, p2++) { \
10860 c1 = *p1; \
10861 c2 = *p2; \
10862 if (c1 != c2) \
10863 return (c1 < c2) ? -1 : 1; \
10864 } \
10865 } \
10866 while (0)
10867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 int kind1, kind2;
10869 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010870 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 kind1 = PyUnicode_KIND(str1);
10873 kind2 = PyUnicode_KIND(str2);
10874 data1 = PyUnicode_DATA(str1);
10875 data2 = PyUnicode_DATA(str2);
10876 len1 = PyUnicode_GET_LENGTH(str1);
10877 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010878 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010879
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010880 switch(kind1) {
10881 case PyUnicode_1BYTE_KIND:
10882 {
10883 switch(kind2) {
10884 case PyUnicode_1BYTE_KIND:
10885 {
10886 int cmp = memcmp(data1, data2, len);
10887 /* normalize result of memcmp() into the range [-1; 1] */
10888 if (cmp < 0)
10889 return -1;
10890 if (cmp > 0)
10891 return 1;
10892 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010893 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010894 case PyUnicode_2BYTE_KIND:
10895 COMPARE(Py_UCS1, Py_UCS2);
10896 break;
10897 case PyUnicode_4BYTE_KIND:
10898 COMPARE(Py_UCS1, Py_UCS4);
10899 break;
10900 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010901 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010902 }
10903 break;
10904 }
10905 case PyUnicode_2BYTE_KIND:
10906 {
10907 switch(kind2) {
10908 case PyUnicode_1BYTE_KIND:
10909 COMPARE(Py_UCS2, Py_UCS1);
10910 break;
10911 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010912 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010913 COMPARE(Py_UCS2, Py_UCS2);
10914 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010915 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010916 case PyUnicode_4BYTE_KIND:
10917 COMPARE(Py_UCS2, Py_UCS4);
10918 break;
10919 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010920 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010921 }
10922 break;
10923 }
10924 case PyUnicode_4BYTE_KIND:
10925 {
10926 switch(kind2) {
10927 case PyUnicode_1BYTE_KIND:
10928 COMPARE(Py_UCS4, Py_UCS1);
10929 break;
10930 case PyUnicode_2BYTE_KIND:
10931 COMPARE(Py_UCS4, Py_UCS2);
10932 break;
10933 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010934 {
10935#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10936 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10937 /* normalize result of wmemcmp() into the range [-1; 1] */
10938 if (cmp < 0)
10939 return -1;
10940 if (cmp > 0)
10941 return 1;
10942#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010943 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010944#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010945 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010946 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010947 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010948 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010949 }
10950 break;
10951 }
10952 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010953 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010954 }
10955
Victor Stinner770e19e2012-10-04 22:59:45 +020010956 if (len1 == len2)
10957 return 0;
10958 if (len1 < len2)
10959 return -1;
10960 else
10961 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010962
10963#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010964}
10965
Benjamin Peterson621b4302016-09-09 13:54:34 -070010966static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010967unicode_compare_eq(PyObject *str1, PyObject *str2)
10968{
10969 int kind;
10970 void *data1, *data2;
10971 Py_ssize_t len;
10972 int cmp;
10973
Victor Stinnere5567ad2012-10-23 02:48:49 +020010974 len = PyUnicode_GET_LENGTH(str1);
10975 if (PyUnicode_GET_LENGTH(str2) != len)
10976 return 0;
10977 kind = PyUnicode_KIND(str1);
10978 if (PyUnicode_KIND(str2) != kind)
10979 return 0;
10980 data1 = PyUnicode_DATA(str1);
10981 data2 = PyUnicode_DATA(str2);
10982
10983 cmp = memcmp(data1, data2, len * kind);
10984 return (cmp == 0);
10985}
10986
10987
Alexander Belopolsky40018472011-02-26 01:02:56 +000010988int
10989PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10992 if (PyUnicode_READY(left) == -1 ||
10993 PyUnicode_READY(right) == -1)
10994 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010995
10996 /* a string is equal to itself */
10997 if (left == right)
10998 return 0;
10999
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011000 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011002 PyErr_Format(PyExc_TypeError,
11003 "Can't compare %.100s and %.100s",
11004 left->ob_type->tp_name,
11005 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 return -1;
11007}
11008
Martin v. Löwis5b222132007-06-10 09:51:05 +000011009int
11010PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 Py_ssize_t i;
11013 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011015 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016
Victor Stinner910337b2011-10-03 03:20:16 +020011017 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011018 if (!PyUnicode_IS_READY(uni)) {
11019 const wchar_t *ws = _PyUnicode_WSTR(uni);
11020 /* Compare Unicode string and source character set string */
11021 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11022 if (chr != ustr[i])
11023 return (chr < ustr[i]) ? -1 : 1;
11024 }
11025 /* This check keeps Python strings that end in '\0' from comparing equal
11026 to C strings identical up to that point. */
11027 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11028 return 1; /* uni is longer */
11029 if (ustr[i])
11030 return -1; /* str is longer */
11031 return 0;
11032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011034 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011035 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011036 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011037 size_t len, len2 = strlen(str);
11038 int cmp;
11039
11040 len = Py_MIN(len1, len2);
11041 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011042 if (cmp != 0) {
11043 if (cmp < 0)
11044 return -1;
11045 else
11046 return 1;
11047 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011048 if (len1 > len2)
11049 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011050 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011051 return -1; /* str is longer */
11052 return 0;
11053 }
11054 else {
11055 void *data = PyUnicode_DATA(uni);
11056 /* Compare Unicode string and source character set string */
11057 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011058 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011059 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11060 /* This check keeps Python strings that end in '\0' from comparing equal
11061 to C strings identical up to that point. */
11062 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11063 return 1; /* uni is longer */
11064 if (str[i])
11065 return -1; /* str is longer */
11066 return 0;
11067 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011068}
11069
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011070static int
11071non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11072{
11073 size_t i, len;
11074 const wchar_t *p;
11075 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11076 if (strlen(str) != len)
11077 return 0;
11078 p = _PyUnicode_WSTR(unicode);
11079 assert(p);
11080 for (i = 0; i < len; i++) {
11081 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011082 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011083 return 0;
11084 }
11085 return 1;
11086}
11087
11088int
11089_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11090{
11091 size_t len;
11092 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011093 assert(str);
11094#ifndef NDEBUG
11095 for (const char *p = str; *p; p++) {
11096 assert((unsigned char)*p < 128);
11097 }
11098#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011099 if (PyUnicode_READY(unicode) == -1) {
11100 /* Memory error or bad data */
11101 PyErr_Clear();
11102 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11103 }
11104 if (!PyUnicode_IS_ASCII(unicode))
11105 return 0;
11106 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11107 return strlen(str) == len &&
11108 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11109}
11110
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011111int
11112_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11113{
11114 PyObject *right_uni;
11115 Py_hash_t hash;
11116
11117 assert(_PyUnicode_CHECK(left));
11118 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011119#ifndef NDEBUG
11120 for (const char *p = right->string; *p; p++) {
11121 assert((unsigned char)*p < 128);
11122 }
11123#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011124
11125 if (PyUnicode_READY(left) == -1) {
11126 /* memory error or bad data */
11127 PyErr_Clear();
11128 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11129 }
11130
11131 if (!PyUnicode_IS_ASCII(left))
11132 return 0;
11133
11134 right_uni = _PyUnicode_FromId(right); /* borrowed */
11135 if (right_uni == NULL) {
11136 /* memory error or bad data */
11137 PyErr_Clear();
11138 return _PyUnicode_EqualToASCIIString(left, right->string);
11139 }
11140
11141 if (left == right_uni)
11142 return 1;
11143
11144 if (PyUnicode_CHECK_INTERNED(left))
11145 return 0;
11146
11147 assert(_PyUnicode_HASH(right_uni) != 1);
11148 hash = _PyUnicode_HASH(left);
11149 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11150 return 0;
11151
11152 return unicode_compare_eq(left, right_uni);
11153}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011154
Alexander Belopolsky40018472011-02-26 01:02:56 +000011155PyObject *
11156PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011157{
11158 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011159
Victor Stinnere5567ad2012-10-23 02:48:49 +020011160 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11161 Py_RETURN_NOTIMPLEMENTED;
11162
11163 if (PyUnicode_READY(left) == -1 ||
11164 PyUnicode_READY(right) == -1)
11165 return NULL;
11166
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011167 if (left == right) {
11168 switch (op) {
11169 case Py_EQ:
11170 case Py_LE:
11171 case Py_GE:
11172 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011173 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011174 case Py_NE:
11175 case Py_LT:
11176 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011177 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011178 default:
11179 PyErr_BadArgument();
11180 return NULL;
11181 }
11182 }
11183 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011184 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011185 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011186 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011187 }
11188 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011189 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011190 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011191 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011192}
11193
Alexander Belopolsky40018472011-02-26 01:02:56 +000011194int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011195_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11196{
11197 return unicode_eq(aa, bb);
11198}
11199
11200int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011201PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011202{
Victor Stinner77282cb2013-04-14 19:22:47 +020011203 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 void *buf1, *buf2;
11205 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011206 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011207
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011208 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011209 PyErr_Format(PyExc_TypeError,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011210 "'in <string>' requires string as left operand, not %.100s",
11211 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011212 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011213 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011214 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011215 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011216 if (ensure_unicode(str) < 0)
11217 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011220 kind2 = PyUnicode_KIND(substr);
11221 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011222 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011223 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011224 len2 = PyUnicode_GET_LENGTH(substr);
11225 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011226 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011227 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011228 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011229 if (len2 == 1) {
11230 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11231 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011232 return result;
11233 }
11234 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011235 buf2 = _PyUnicode_AsKind(substr, kind1);
11236 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011237 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011238 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239
Victor Stinner77282cb2013-04-14 19:22:47 +020011240 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 case PyUnicode_1BYTE_KIND:
11242 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11243 break;
11244 case PyUnicode_2BYTE_KIND:
11245 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11246 break;
11247 case PyUnicode_4BYTE_KIND:
11248 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11249 break;
11250 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011251 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011252 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011253
Victor Stinner77282cb2013-04-14 19:22:47 +020011254 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 PyMem_Free(buf2);
11256
Guido van Rossum403d68b2000-03-13 15:55:09 +000011257 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011258}
11259
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260/* Concat to string or Unicode object giving a new Unicode object. */
11261
Alexander Belopolsky40018472011-02-26 01:02:56 +000011262PyObject *
11263PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011265 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011266 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011267 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011269 if (ensure_unicode(left) < 0)
11270 return NULL;
11271
11272 if (!PyUnicode_Check(right)) {
11273 PyErr_Format(PyExc_TypeError,
11274 "can only concatenate str (not \"%.200s\") to str",
11275 right->ob_type->tp_name);
11276 return NULL;
11277 }
11278 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
11281 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011282 if (left == unicode_empty)
11283 return PyUnicode_FromObject(right);
11284 if (right == unicode_empty)
11285 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011287 left_len = PyUnicode_GET_LENGTH(left);
11288 right_len = PyUnicode_GET_LENGTH(right);
11289 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011290 PyErr_SetString(PyExc_OverflowError,
11291 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011292 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011293 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011294 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011295
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011296 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11297 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011298 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011299
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011301 result = PyUnicode_New(new_len, maxchar);
11302 if (result == NULL)
11303 return NULL;
11304 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11305 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11306 assert(_PyUnicode_CheckConsistency(result, 1));
11307 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308}
11309
Walter Dörwald1ab83302007-05-18 17:15:44 +000011310void
Victor Stinner23e56682011-10-03 03:54:37 +020011311PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011312{
Victor Stinner23e56682011-10-03 03:54:37 +020011313 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011314 Py_UCS4 maxchar, maxchar2;
11315 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011316
11317 if (p_left == NULL) {
11318 if (!PyErr_Occurred())
11319 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011320 return;
11321 }
Victor Stinner23e56682011-10-03 03:54:37 +020011322 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011323 if (right == NULL || left == NULL
11324 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011325 if (!PyErr_Occurred())
11326 PyErr_BadInternalCall();
11327 goto error;
11328 }
11329
Benjamin Petersonbac79492012-01-14 13:34:47 -050011330 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011331 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011332 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011333 goto error;
11334
Victor Stinner488fa492011-12-12 00:01:39 +010011335 /* Shortcuts */
11336 if (left == unicode_empty) {
11337 Py_DECREF(left);
11338 Py_INCREF(right);
11339 *p_left = right;
11340 return;
11341 }
11342 if (right == unicode_empty)
11343 return;
11344
11345 left_len = PyUnicode_GET_LENGTH(left);
11346 right_len = PyUnicode_GET_LENGTH(right);
11347 if (left_len > PY_SSIZE_T_MAX - right_len) {
11348 PyErr_SetString(PyExc_OverflowError,
11349 "strings are too large to concat");
11350 goto error;
11351 }
11352 new_len = left_len + right_len;
11353
11354 if (unicode_modifiable(left)
11355 && PyUnicode_CheckExact(right)
11356 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011357 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11358 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011359 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011360 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011361 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11362 {
11363 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011364 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011365 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011366
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011367 /* copy 'right' into the newly allocated area of 'left' */
11368 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011369 }
Victor Stinner488fa492011-12-12 00:01:39 +010011370 else {
11371 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11372 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011373 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011374
Victor Stinner488fa492011-12-12 00:01:39 +010011375 /* Concat the two Unicode strings */
11376 res = PyUnicode_New(new_len, maxchar);
11377 if (res == NULL)
11378 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011379 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11380 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011381 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011382 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011383 }
11384 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011385 return;
11386
11387error:
Victor Stinner488fa492011-12-12 00:01:39 +010011388 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011389}
11390
11391void
11392PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11393{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011394 PyUnicode_Append(pleft, right);
11395 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011396}
11397
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011398/*
11399Wraps stringlib_parse_args_finds() and additionally ensures that the
11400first argument is a unicode object.
11401*/
11402
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011403static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011404parse_args_finds_unicode(const char * function_name, PyObject *args,
11405 PyObject **substring,
11406 Py_ssize_t *start, Py_ssize_t *end)
11407{
11408 if(stringlib_parse_args_finds(function_name, args, substring,
11409 start, end)) {
11410 if (ensure_unicode(*substring) < 0)
11411 return 0;
11412 return 1;
11413 }
11414 return 0;
11415}
11416
/* Docstring text for S.count(); defined through PyDoc_STRVAR. */
PyDoc_STRVAR(count__doc__,
             "S.count(sub[, start[, end]]) -> int\n\
\n\
Return the number of non-overlapping occurrences of substring sub in\n\
string S[start:end]. Optional arguments start and end are\n\
interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423
11424static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011425unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011427 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011428 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011429 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011431 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 void *buf1, *buf2;
11433 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011435 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 kind1 = PyUnicode_KIND(self);
11439 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011440 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011441 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 len1 = PyUnicode_GET_LENGTH(self);
11444 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011446 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011447 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011448
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011449 buf1 = PyUnicode_DATA(self);
11450 buf2 = PyUnicode_DATA(substring);
11451 if (kind2 != kind1) {
11452 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011453 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011454 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011455 }
11456 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 case PyUnicode_1BYTE_KIND:
11458 iresult = ucs1lib_count(
11459 ((Py_UCS1*)buf1) + start, end - start,
11460 buf2, len2, PY_SSIZE_T_MAX
11461 );
11462 break;
11463 case PyUnicode_2BYTE_KIND:
11464 iresult = ucs2lib_count(
11465 ((Py_UCS2*)buf1) + start, end - start,
11466 buf2, len2, PY_SSIZE_T_MAX
11467 );
11468 break;
11469 case PyUnicode_4BYTE_KIND:
11470 iresult = ucs4lib_count(
11471 ((Py_UCS4*)buf1) + start, end - start,
11472 buf2, len2, PY_SSIZE_T_MAX
11473 );
11474 break;
11475 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011476 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 }
11478
11479 result = PyLong_FromSsize_t(iresult);
11480
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011481 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 return result;
11485}
11486
INADA Naoki3ae20562017-01-16 20:41:20 +090011487/*[clinic input]
11488str.encode as unicode_encode
11489
11490 encoding: str(c_default="NULL") = 'utf-8'
11491 The encoding in which to encode the string.
11492 errors: str(c_default="NULL") = 'strict'
11493 The error handling scheme to use for encoding errors.
11494 The default is 'strict' meaning that encoding errors raise a
11495 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11496 'xmlcharrefreplace' as well as any other name registered with
11497 codecs.register_error that can handle UnicodeEncodeErrors.
11498
11499Encode the string using the codec registered for encoding.
11500[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
11502static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011503unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011504/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011506 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011507}
11508
INADA Naoki3ae20562017-01-16 20:41:20 +090011509/*[clinic input]
11510str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511
INADA Naoki3ae20562017-01-16 20:41:20 +090011512 tabsize: int = 8
11513
11514Return a copy where all tab characters are expanded using spaces.
11515
11516If tabsize is not given, a tab size of 8 characters is assumed.
11517[clinic start generated code]*/
11518
11519static PyObject *
11520unicode_expandtabs_impl(PyObject *self, int tabsize)
11521/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011523 Py_ssize_t i, j, line_pos, src_len, incr;
11524 Py_UCS4 ch;
11525 PyObject *u;
11526 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011527 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011528 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529
Antoine Pitrou22425222011-10-04 19:10:51 +020011530 if (PyUnicode_READY(self) == -1)
11531 return NULL;
11532
Thomas Wouters7e474022000-07-16 12:04:32 +000011533 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011534 src_len = PyUnicode_GET_LENGTH(self);
11535 i = j = line_pos = 0;
11536 kind = PyUnicode_KIND(self);
11537 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011538 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011539 for (; i < src_len; i++) {
11540 ch = PyUnicode_READ(kind, src_data, i);
11541 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011542 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011543 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011544 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011546 goto overflow;
11547 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011548 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011549 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011553 goto overflow;
11554 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011556 if (ch == '\n' || ch == '\r')
11557 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011559 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011560 if (!found)
11561 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011562
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011564 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565 if (!u)
11566 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011567 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568
Antoine Pitroue71d5742011-10-04 15:55:09 +020011569 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570
Antoine Pitroue71d5742011-10-04 15:55:09 +020011571 for (; i < src_len; i++) {
11572 ch = PyUnicode_READ(kind, src_data, i);
11573 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011574 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011575 incr = tabsize - (line_pos % tabsize);
11576 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011577 FILL(kind, dest_data, ' ', j, incr);
11578 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011579 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011580 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011582 line_pos++;
11583 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011584 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011585 if (ch == '\n' || ch == '\r')
11586 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011588 }
11589 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011590 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011591
Antoine Pitroue71d5742011-10-04 15:55:09 +020011592 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011593 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11594 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595}
11596
/* Docstring text for S.find(); defined through PyDoc_STRVAR. */
PyDoc_STRVAR(find__doc__,
             "S.find(sub[, start[, end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
11606static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011609 /* initialize variables to prevent gcc warning */
11610 PyObject *substring = NULL;
11611 Py_ssize_t start = 0;
11612 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011613 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011615 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011618 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011621 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011623 if (result == -2)
11624 return NULL;
11625
Christian Heimes217cfd12007-12-02 14:31:20 +000011626 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627}
11628
11629static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011630unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011632 void *data;
11633 enum PyUnicode_Kind kind;
11634 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011635
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011636 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011637 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011639 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011640 if (PyUnicode_READY(self) == -1) {
11641 return NULL;
11642 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011643 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11644 PyErr_SetString(PyExc_IndexError, "string index out of range");
11645 return NULL;
11646 }
11647 kind = PyUnicode_KIND(self);
11648 data = PyUnicode_DATA(self);
11649 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011650 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651}
11652
Guido van Rossumc2504932007-09-18 19:42:40 +000011653/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011654 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011655static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011656unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657{
Guido van Rossumc2504932007-09-18 19:42:40 +000011658 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011659 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011660
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011661#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011662 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011663#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 if (_PyUnicode_HASH(self) != -1)
11665 return _PyUnicode_HASH(self);
11666 if (PyUnicode_READY(self) == -1)
11667 return -1;
11668 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011669 /*
11670 We make the hash of the empty string be 0, rather than using
11671 (prefix ^ suffix), since this slightly obfuscates the hash secret
11672 */
11673 if (len == 0) {
11674 _PyUnicode_HASH(self) = 0;
11675 return 0;
11676 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011677 x = _Py_HashBytes(PyUnicode_DATA(self),
11678 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011680 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681}
11682
/* Docstring text for S.index(); defined through PyDoc_STRVAR. */
PyDoc_STRVAR(index__doc__,
             "S.index(sub[, start[, end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found, \n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691
11692static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011695 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011696 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011697 PyObject *substring = NULL;
11698 Py_ssize_t start = 0;
11699 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011701 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011704 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011707 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 if (result == -2)
11710 return NULL;
11711
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712 if (result < 0) {
11713 PyErr_SetString(PyExc_ValueError, "substring not found");
11714 return NULL;
11715 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011716
Christian Heimes217cfd12007-12-02 14:31:20 +000011717 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718}
11719
INADA Naoki3ae20562017-01-16 20:41:20 +090011720/*[clinic input]
11721str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722
INADA Naoki3ae20562017-01-16 20:41:20 +090011723Return True if the string is a lowercase string, False otherwise.
11724
11725A string is lowercase if all cased characters in the string are lowercase and
11726there is at least one cased character in the string.
11727[clinic start generated code]*/
11728
11729static PyObject *
11730unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011731/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 Py_ssize_t i, length;
11734 int kind;
11735 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736 int cased;
11737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738 if (PyUnicode_READY(self) == -1)
11739 return NULL;
11740 length = PyUnicode_GET_LENGTH(self);
11741 kind = PyUnicode_KIND(self);
11742 data = PyUnicode_DATA(self);
11743
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 if (length == 1)
11746 return PyBool_FromLong(
11747 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011749 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011751 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011752
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 for (i = 0; i < length; i++) {
11755 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011756
Benjamin Peterson29060642009-01-31 22:14:21 +000011757 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011758 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011759 else if (!cased && Py_UNICODE_ISLOWER(ch))
11760 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011762 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763}
11764
INADA Naoki3ae20562017-01-16 20:41:20 +090011765/*[clinic input]
11766str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767
INADA Naoki3ae20562017-01-16 20:41:20 +090011768Return True if the string is an uppercase string, False otherwise.
11769
11770A string is uppercase if all cased characters in the string are uppercase and
11771there is at least one cased character in the string.
11772[clinic start generated code]*/
11773
11774static PyObject *
11775unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011776/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 Py_ssize_t i, length;
11779 int kind;
11780 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781 int cased;
11782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 if (PyUnicode_READY(self) == -1)
11784 return NULL;
11785 length = PyUnicode_GET_LENGTH(self);
11786 kind = PyUnicode_KIND(self);
11787 data = PyUnicode_DATA(self);
11788
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 if (length == 1)
11791 return PyBool_FromLong(
11792 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011794 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011796 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011797
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 for (i = 0; i < length; i++) {
11800 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011801
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011803 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011804 else if (!cased && Py_UNICODE_ISUPPER(ch))
11805 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011807 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808}
11809
INADA Naoki3ae20562017-01-16 20:41:20 +090011810/*[clinic input]
11811str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812
INADA Naoki3ae20562017-01-16 20:41:20 +090011813Return True if the string is a title-cased string, False otherwise.
11814
11815In a title-cased string, upper- and title-case characters may only
11816follow uncased characters and lowercase characters only cased ones.
11817[clinic start generated code]*/
11818
11819static PyObject *
11820unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011821/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 Py_ssize_t i, length;
11824 int kind;
11825 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826 int cased, previous_is_cased;
11827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 if (PyUnicode_READY(self) == -1)
11829 return NULL;
11830 length = PyUnicode_GET_LENGTH(self);
11831 kind = PyUnicode_KIND(self);
11832 data = PyUnicode_DATA(self);
11833
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 if (length == 1) {
11836 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11837 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11838 (Py_UNICODE_ISUPPER(ch) != 0));
11839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011841 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011843 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011844
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845 cased = 0;
11846 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 for (i = 0; i < length; i++) {
11848 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011849
Benjamin Peterson29060642009-01-31 22:14:21 +000011850 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11851 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011852 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011853 previous_is_cased = 1;
11854 cased = 1;
11855 }
11856 else if (Py_UNICODE_ISLOWER(ch)) {
11857 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011858 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011859 previous_is_cased = 1;
11860 cased = 1;
11861 }
11862 else
11863 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011865 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866}
11867
INADA Naoki3ae20562017-01-16 20:41:20 +090011868/*[clinic input]
11869str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870
INADA Naoki3ae20562017-01-16 20:41:20 +090011871Return True if the string is a whitespace string, False otherwise.
11872
11873A string is whitespace if all characters in the string are whitespace and there
11874is at least one character in the string.
11875[clinic start generated code]*/
11876
11877static PyObject *
11878unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011879/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 Py_ssize_t i, length;
11882 int kind;
11883 void *data;
11884
11885 if (PyUnicode_READY(self) == -1)
11886 return NULL;
11887 length = PyUnicode_GET_LENGTH(self);
11888 kind = PyUnicode_KIND(self);
11889 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 if (length == 1)
11893 return PyBool_FromLong(
11894 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011896 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011898 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 for (i = 0; i < length; i++) {
11901 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011902 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011903 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011905 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906}
11907
INADA Naoki3ae20562017-01-16 20:41:20 +090011908/*[clinic input]
11909str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011910
INADA Naoki3ae20562017-01-16 20:41:20 +090011911Return True if the string is an alphabetic string, False otherwise.
11912
11913A string is alphabetic if all characters in the string are alphabetic and there
11914is at least one character in the string.
11915[clinic start generated code]*/
11916
11917static PyObject *
11918unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011919/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011920{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 Py_ssize_t i, length;
11922 int kind;
11923 void *data;
11924
11925 if (PyUnicode_READY(self) == -1)
11926 return NULL;
11927 length = PyUnicode_GET_LENGTH(self);
11928 kind = PyUnicode_KIND(self);
11929 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011930
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011931 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 if (length == 1)
11933 return PyBool_FromLong(
11934 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011935
11936 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011938 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 for (i = 0; i < length; i++) {
11941 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011942 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011943 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011944 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011945}
11946
INADA Naoki3ae20562017-01-16 20:41:20 +090011947/*[clinic input]
11948str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011949
INADA Naoki3ae20562017-01-16 20:41:20 +090011950Return True if the string is an alpha-numeric string, False otherwise.
11951
11952A string is alpha-numeric if all characters in the string are alpha-numeric and
11953there is at least one character in the string.
11954[clinic start generated code]*/
11955
11956static PyObject *
11957unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011958/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011959{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 int kind;
11961 void *data;
11962 Py_ssize_t len, i;
11963
11964 if (PyUnicode_READY(self) == -1)
11965 return NULL;
11966
11967 kind = PyUnicode_KIND(self);
11968 data = PyUnicode_DATA(self);
11969 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011970
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011971 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 if (len == 1) {
11973 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11974 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11975 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011976
11977 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011979 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 for (i = 0; i < len; i++) {
11982 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011983 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011984 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011985 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011986 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011987}
11988
INADA Naoki3ae20562017-01-16 20:41:20 +090011989/*[clinic input]
11990str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991
INADA Naoki3ae20562017-01-16 20:41:20 +090011992Return True if the string is a decimal string, False otherwise.
11993
11994A string is a decimal string if all characters in the string are decimal and
11995there is at least one character in the string.
11996[clinic start generated code]*/
11997
11998static PyObject *
11999unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012000/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 Py_ssize_t i, length;
12003 int kind;
12004 void *data;
12005
12006 if (PyUnicode_READY(self) == -1)
12007 return NULL;
12008 length = PyUnicode_GET_LENGTH(self);
12009 kind = PyUnicode_KIND(self);
12010 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 if (length == 1)
12014 return PyBool_FromLong(
12015 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012017 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012019 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 for (i = 0; i < length; i++) {
12022 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012023 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012025 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026}
12027
INADA Naoki3ae20562017-01-16 20:41:20 +090012028/*[clinic input]
12029str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030
INADA Naoki3ae20562017-01-16 20:41:20 +090012031Return True if the string is a digit string, False otherwise.
12032
12033A string is a digit string if all characters in the string are digits and there
12034is at least one character in the string.
12035[clinic start generated code]*/
12036
12037static PyObject *
12038unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012039/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 Py_ssize_t i, length;
12042 int kind;
12043 void *data;
12044
12045 if (PyUnicode_READY(self) == -1)
12046 return NULL;
12047 length = PyUnicode_GET_LENGTH(self);
12048 kind = PyUnicode_KIND(self);
12049 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 if (length == 1) {
12053 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12054 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012057 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012059 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 for (i = 0; i < length; i++) {
12062 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012063 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012065 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066}
12067
INADA Naoki3ae20562017-01-16 20:41:20 +090012068/*[clinic input]
12069str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070
INADA Naoki3ae20562017-01-16 20:41:20 +090012071Return True if the string is a numeric string, False otherwise.
12072
12073A string is numeric if all characters in the string are numeric and there is at
12074least one character in the string.
12075[clinic start generated code]*/
12076
12077static PyObject *
12078unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012079/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 Py_ssize_t i, length;
12082 int kind;
12083 void *data;
12084
12085 if (PyUnicode_READY(self) == -1)
12086 return NULL;
12087 length = PyUnicode_GET_LENGTH(self);
12088 kind = PyUnicode_KIND(self);
12089 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 if (length == 1)
12093 return PyBool_FromLong(
12094 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012096 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012098 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 for (i = 0; i < length; i++) {
12101 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012102 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012104 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105}
12106
Martin v. Löwis47383402007-08-15 07:32:56 +000012107int
12108PyUnicode_IsIdentifier(PyObject *self)
12109{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 int kind;
12111 void *data;
12112 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012113 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 if (PyUnicode_READY(self) == -1) {
12116 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012117 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 }
12119
12120 /* Special case for empty strings */
12121 if (PyUnicode_GET_LENGTH(self) == 0)
12122 return 0;
12123 kind = PyUnicode_KIND(self);
12124 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012125
12126 /* PEP 3131 says that the first character must be in
12127 XID_Start and subsequent characters in XID_Continue,
12128 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012129 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012130 letters, digits, underscore). However, given the current
12131 definition of XID_Start and XID_Continue, it is sufficient
12132 to check just for these, except that _ must be allowed
12133 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012135 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012136 return 0;
12137
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012138 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012141 return 1;
12142}
12143
INADA Naoki3ae20562017-01-16 20:41:20 +090012144/*[clinic input]
12145str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012146
INADA Naoki3ae20562017-01-16 20:41:20 +090012147Return True if the string is a valid Python identifier, False otherwise.
12148
12149Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12150"class".
12151[clinic start generated code]*/
12152
12153static PyObject *
12154unicode_isidentifier_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012155/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012156{
12157 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12158}
12159
INADA Naoki3ae20562017-01-16 20:41:20 +090012160/*[clinic input]
12161str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012162
INADA Naoki3ae20562017-01-16 20:41:20 +090012163Return True if the string is printable, False otherwise.
12164
12165A string is printable if all of its characters are considered printable in
12166repr() or if it is empty.
12167[clinic start generated code]*/
12168
12169static PyObject *
12170unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012171/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012172{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 Py_ssize_t i, length;
12174 int kind;
12175 void *data;
12176
12177 if (PyUnicode_READY(self) == -1)
12178 return NULL;
12179 length = PyUnicode_GET_LENGTH(self);
12180 kind = PyUnicode_KIND(self);
12181 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012182
12183 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 if (length == 1)
12185 return PyBool_FromLong(
12186 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 for (i = 0; i < length; i++) {
12189 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012190 Py_RETURN_FALSE;
12191 }
12192 }
12193 Py_RETURN_TRUE;
12194}
12195
INADA Naoki3ae20562017-01-16 20:41:20 +090012196/*[clinic input]
12197str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198
INADA Naoki3ae20562017-01-16 20:41:20 +090012199 iterable: object
12200 /
12201
12202Concatenate any number of strings.
12203
Martin Panter91a88662017-01-24 00:30:06 +000012204The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012205The result is returned as a new string.
12206
12207Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12208[clinic start generated code]*/
12209
12210static PyObject *
12211unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012212/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213{
INADA Naoki3ae20562017-01-16 20:41:20 +090012214 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215}
12216
Martin v. Löwis18e16552006-02-15 17:27:45 +000012217static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012218unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 if (PyUnicode_READY(self) == -1)
12221 return -1;
12222 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223}
12224
INADA Naoki3ae20562017-01-16 20:41:20 +090012225/*[clinic input]
12226str.ljust as unicode_ljust
12227
12228 width: Py_ssize_t
12229 fillchar: Py_UCS4 = ' '
12230 /
12231
12232Return a left-justified string of length width.
12233
12234Padding is done using the specified fill character (default is a space).
12235[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236
12237static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012238unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12239/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012241 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243
Victor Stinnerc4b49542011-12-11 22:44:26 +010012244 if (PyUnicode_GET_LENGTH(self) >= width)
12245 return unicode_result_unchanged(self);
12246
12247 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248}
12249
INADA Naoki3ae20562017-01-16 20:41:20 +090012250/*[clinic input]
12251str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252
INADA Naoki3ae20562017-01-16 20:41:20 +090012253Return a copy of the string converted to lowercase.
12254[clinic start generated code]*/
12255
12256static PyObject *
12257unicode_lower_impl(PyObject *self)
12258/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012260 if (PyUnicode_READY(self) == -1)
12261 return NULL;
12262 if (PyUnicode_IS_ASCII(self))
12263 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012264 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265}
12266
/* Strip modes.  Each value doubles as an index into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip mode, indexed by the constants above.  The
   second const makes the pointers themselves read-only: this is a fixed
   lookup table that nothing should ever reassign. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip mode to the name of the corresponding str method. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012275
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012276/* externally visible for str.strip(unicode) */
12277PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012278_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012279{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 void *data;
12281 int kind;
12282 Py_ssize_t i, j, len;
12283 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012284 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12287 return NULL;
12288
12289 kind = PyUnicode_KIND(self);
12290 data = PyUnicode_DATA(self);
12291 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012292 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12294 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012295 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012296
Benjamin Peterson14339b62009-01-31 16:36:08 +000012297 i = 0;
12298 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012299 while (i < len) {
12300 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12301 if (!BLOOM(sepmask, ch))
12302 break;
12303 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12304 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 i++;
12306 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012307 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012308
Benjamin Peterson14339b62009-01-31 16:36:08 +000012309 j = len;
12310 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012311 j--;
12312 while (j >= i) {
12313 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12314 if (!BLOOM(sepmask, ch))
12315 break;
12316 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12317 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012318 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012319 }
12320
Benjamin Peterson29060642009-01-31 22:14:21 +000012321 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012322 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012323
Victor Stinner7931d9a2011-11-04 00:22:48 +010012324 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325}
12326
12327PyObject*
12328PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12329{
12330 unsigned char *data;
12331 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012332 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333
Victor Stinnerde636f32011-10-01 03:55:54 +020012334 if (PyUnicode_READY(self) == -1)
12335 return NULL;
12336
Victor Stinner684d5fd2012-05-03 02:32:34 +020012337 length = PyUnicode_GET_LENGTH(self);
12338 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012339
Victor Stinner684d5fd2012-05-03 02:32:34 +020012340 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012341 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342
Victor Stinnerde636f32011-10-01 03:55:54 +020012343 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012344 PyErr_SetString(PyExc_IndexError, "string index out of range");
12345 return NULL;
12346 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012347 if (start >= length || end < start)
12348 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012349
Victor Stinner684d5fd2012-05-03 02:32:34 +020012350 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012351 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012352 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012353 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012354 }
12355 else {
12356 kind = PyUnicode_KIND(self);
12357 data = PyUnicode_1BYTE_DATA(self);
12358 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012359 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012360 length);
12361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363
12364static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012365do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 Py_ssize_t len, i, j;
12368
12369 if (PyUnicode_READY(self) == -1)
12370 return NULL;
12371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012373
Victor Stinnercc7af722013-04-09 22:39:24 +020012374 if (PyUnicode_IS_ASCII(self)) {
12375 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12376
12377 i = 0;
12378 if (striptype != RIGHTSTRIP) {
12379 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012380 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012381 if (!_Py_ascii_whitespace[ch])
12382 break;
12383 i++;
12384 }
12385 }
12386
12387 j = len;
12388 if (striptype != LEFTSTRIP) {
12389 j--;
12390 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012391 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012392 if (!_Py_ascii_whitespace[ch])
12393 break;
12394 j--;
12395 }
12396 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012397 }
12398 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012399 else {
12400 int kind = PyUnicode_KIND(self);
12401 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012402
Victor Stinnercc7af722013-04-09 22:39:24 +020012403 i = 0;
12404 if (striptype != RIGHTSTRIP) {
12405 while (i < len) {
12406 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12407 if (!Py_UNICODE_ISSPACE(ch))
12408 break;
12409 i++;
12410 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012411 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012412
12413 j = len;
12414 if (striptype != LEFTSTRIP) {
12415 j--;
12416 while (j >= i) {
12417 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12418 if (!Py_UNICODE_ISSPACE(ch))
12419 break;
12420 j--;
12421 }
12422 j++;
12423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012424 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012425
Victor Stinner7931d9a2011-11-04 00:22:48 +010012426 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427}
12428
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012429
12430static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012431do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012432{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012433 if (sep != NULL && sep != Py_None) {
12434 if (PyUnicode_Check(sep))
12435 return _PyUnicode_XStrip(self, striptype, sep);
12436 else {
12437 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012438 "%s arg must be None or str",
12439 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012440 return NULL;
12441 }
12442 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012443
Benjamin Peterson14339b62009-01-31 16:36:08 +000012444 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012445}
12446
12447
INADA Naoki3ae20562017-01-16 20:41:20 +090012448/*[clinic input]
12449str.strip as unicode_strip
12450
12451 chars: object = None
12452 /
12453
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012455
12456If chars is given and not None, remove characters in chars instead.
12457[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012458
12459static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012460unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012461/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012462{
INADA Naoki3ae20562017-01-16 20:41:20 +090012463 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012464}
12465
12466
INADA Naoki3ae20562017-01-16 20:41:20 +090012467/*[clinic input]
12468str.lstrip as unicode_lstrip
12469
12470 chars: object = NULL
12471 /
12472
12473Return a copy of the string with leading whitespace removed.
12474
12475If chars is given and not None, remove characters in chars instead.
12476[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012477
12478static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012479unicode_lstrip_impl(PyObject *self, PyObject *chars)
12480/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012481{
INADA Naoki3ae20562017-01-16 20:41:20 +090012482 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012483}
12484
12485
INADA Naoki3ae20562017-01-16 20:41:20 +090012486/*[clinic input]
12487str.rstrip as unicode_rstrip
12488
12489 chars: object = NULL
12490 /
12491
12492Return a copy of the string with trailing whitespace removed.
12493
12494If chars is given and not None, remove characters in chars instead.
12495[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012496
12497static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012498unicode_rstrip_impl(PyObject *self, PyObject *chars)
12499/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012500{
INADA Naoki3ae20562017-01-16 20:41:20 +090012501 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012502}
12503
12504
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012506unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012508 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510
Serhiy Storchaka05997252013-01-26 12:14:02 +020012511 if (len < 1)
12512 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513
Victor Stinnerc4b49542011-12-11 22:44:26 +010012514 /* no repeat, return original string */
12515 if (len == 1)
12516 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012517
Benjamin Petersonbac79492012-01-14 13:34:47 -050012518 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 return NULL;
12520
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012521 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012522 PyErr_SetString(PyExc_OverflowError,
12523 "repeated string is too long");
12524 return NULL;
12525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012527
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012528 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529 if (!u)
12530 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012531 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 if (PyUnicode_GET_LENGTH(str) == 1) {
12534 const int kind = PyUnicode_KIND(str);
12535 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012536 if (kind == PyUnicode_1BYTE_KIND) {
12537 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012538 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012539 }
12540 else if (kind == PyUnicode_2BYTE_KIND) {
12541 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012542 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012543 ucs2[n] = fill_char;
12544 } else {
12545 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12546 assert(kind == PyUnicode_4BYTE_KIND);
12547 for (n = 0; n < len; ++n)
12548 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 }
12551 else {
12552 /* number of characters copied this far */
12553 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012554 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012556 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012558 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012560 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012561 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563 }
12564
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012565 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012566 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567}
12568
Alexander Belopolsky40018472011-02-26 01:02:56 +000012569PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012570PyUnicode_Replace(PyObject *str,
12571 PyObject *substr,
12572 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012573 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012575 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12576 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012577 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012578 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579}
12580
INADA Naoki3ae20562017-01-16 20:41:20 +090012581/*[clinic input]
12582str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583
INADA Naoki3ae20562017-01-16 20:41:20 +090012584 old: unicode
12585 new: unicode
12586 count: Py_ssize_t = -1
12587 Maximum number of occurrences to replace.
12588 -1 (the default value) means replace all occurrences.
12589 /
12590
12591Return a copy with all occurrences of substring old replaced by new.
12592
12593If the optional argument count is given, only the first count occurrences are
12594replaced.
12595[clinic start generated code]*/
12596
12597static PyObject *
12598unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12599 Py_ssize_t count)
12600/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012602 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012604 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605}
12606
Alexander Belopolsky40018472011-02-26 01:02:56 +000012607static PyObject *
12608unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012610 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 Py_ssize_t isize;
12612 Py_ssize_t osize, squote, dquote, i, o;
12613 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012614 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012618 return NULL;
12619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 isize = PyUnicode_GET_LENGTH(unicode);
12621 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012622
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 /* Compute length of output, quote characters, and
12624 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012625 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 max = 127;
12627 squote = dquote = 0;
12628 ikind = PyUnicode_KIND(unicode);
12629 for (i = 0; i < isize; i++) {
12630 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012631 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012633 case '\'': squote++; break;
12634 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012636 incr = 2;
12637 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 default:
12639 /* Fast-path ASCII */
12640 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012641 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012643 ;
12644 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012647 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012649 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012651 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012653 if (osize > PY_SSIZE_T_MAX - incr) {
12654 PyErr_SetString(PyExc_OverflowError,
12655 "string is too long to generate repr");
12656 return NULL;
12657 }
12658 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 }
12660
12661 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012662 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012664 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 if (dquote)
12666 /* Both squote and dquote present. Use squote,
12667 and escape them */
12668 osize += squote;
12669 else
12670 quote = '"';
12671 }
Victor Stinner55c08782013-04-14 18:45:39 +020012672 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673
12674 repr = PyUnicode_New(osize, max);
12675 if (repr == NULL)
12676 return NULL;
12677 okind = PyUnicode_KIND(repr);
12678 odata = PyUnicode_DATA(repr);
12679
12680 PyUnicode_WRITE(okind, odata, 0, quote);
12681 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012682 if (unchanged) {
12683 _PyUnicode_FastCopyCharacters(repr, 1,
12684 unicode, 0,
12685 isize);
12686 }
12687 else {
12688 for (i = 0, o = 1; i < isize; i++) {
12689 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690
Victor Stinner55c08782013-04-14 18:45:39 +020012691 /* Escape quotes and backslashes */
12692 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012693 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012695 continue;
12696 }
12697
12698 /* Map special whitespace to '\t', \n', '\r' */
12699 if (ch == '\t') {
12700 PyUnicode_WRITE(okind, odata, o++, '\\');
12701 PyUnicode_WRITE(okind, odata, o++, 't');
12702 }
12703 else if (ch == '\n') {
12704 PyUnicode_WRITE(okind, odata, o++, '\\');
12705 PyUnicode_WRITE(okind, odata, o++, 'n');
12706 }
12707 else if (ch == '\r') {
12708 PyUnicode_WRITE(okind, odata, o++, '\\');
12709 PyUnicode_WRITE(okind, odata, o++, 'r');
12710 }
12711
12712 /* Map non-printable US ASCII to '\xhh' */
12713 else if (ch < ' ' || ch == 0x7F) {
12714 PyUnicode_WRITE(okind, odata, o++, '\\');
12715 PyUnicode_WRITE(okind, odata, o++, 'x');
12716 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12717 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12718 }
12719
12720 /* Copy ASCII characters as-is */
12721 else if (ch < 0x7F) {
12722 PyUnicode_WRITE(okind, odata, o++, ch);
12723 }
12724
12725 /* Non-ASCII characters */
12726 else {
12727 /* Map Unicode whitespace and control characters
12728 (categories Z* and C* except ASCII space)
12729 */
12730 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12731 PyUnicode_WRITE(okind, odata, o++, '\\');
12732 /* Map 8-bit characters to '\xhh' */
12733 if (ch <= 0xff) {
12734 PyUnicode_WRITE(okind, odata, o++, 'x');
12735 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12736 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12737 }
12738 /* Map 16-bit characters to '\uxxxx' */
12739 else if (ch <= 0xffff) {
12740 PyUnicode_WRITE(okind, odata, o++, 'u');
12741 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12742 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12743 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12744 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12745 }
12746 /* Map 21-bit characters to '\U00xxxxxx' */
12747 else {
12748 PyUnicode_WRITE(okind, odata, o++, 'U');
12749 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12750 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12751 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12752 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12753 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12754 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12755 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12756 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12757 }
12758 }
12759 /* Copy characters as-is */
12760 else {
12761 PyUnicode_WRITE(okind, odata, o++, ch);
12762 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012763 }
12764 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012765 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012766 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012767 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012768 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769}
12770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012771PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012773\n\
12774Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012775such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776arguments start and end are interpreted as in slice notation.\n\
12777\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012778Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779
12780static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012781unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012783 /* initialize variables to prevent gcc warning */
12784 PyObject *substring = NULL;
12785 Py_ssize_t start = 0;
12786 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012787 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012788
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012789 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012792 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012794
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012795 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797 if (result == -2)
12798 return NULL;
12799
Christian Heimes217cfd12007-12-02 14:31:20 +000012800 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801}
12802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012803PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012804 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012806Return the highest index in S where substring sub is found,\n\
12807such that sub is contained within S[start:end]. Optional\n\
12808arguments start and end are interpreted as in slice notation.\n\
12809\n\
12810Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811
12812static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012815 /* initialize variables to prevent gcc warning */
12816 PyObject *substring = NULL;
12817 Py_ssize_t start = 0;
12818 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012819 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012821 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012824 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012827 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012829 if (result == -2)
12830 return NULL;
12831
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832 if (result < 0) {
12833 PyErr_SetString(PyExc_ValueError, "substring not found");
12834 return NULL;
12835 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012836
Christian Heimes217cfd12007-12-02 14:31:20 +000012837 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838}
12839
INADA Naoki3ae20562017-01-16 20:41:20 +090012840/*[clinic input]
12841str.rjust as unicode_rjust
12842
12843 width: Py_ssize_t
12844 fillchar: Py_UCS4 = ' '
12845 /
12846
12847Return a right-justified string of length width.
12848
12849Padding is done using the specified fill character (default is a space).
12850[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851
12852static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012853unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12854/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012856 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857 return NULL;
12858
Victor Stinnerc4b49542011-12-11 22:44:26 +010012859 if (PyUnicode_GET_LENGTH(self) >= width)
12860 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861
Victor Stinnerc4b49542011-12-11 22:44:26 +010012862 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863}
12864
Alexander Belopolsky40018472011-02-26 01:02:56 +000012865PyObject *
12866PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012868 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012869 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012871 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872}
12873
INADA Naoki3ae20562017-01-16 20:41:20 +090012874/*[clinic input]
12875str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876
INADA Naoki3ae20562017-01-16 20:41:20 +090012877 sep: object = None
        The delimiter according to which to split the string.
12879 None (the default value) means split according to any whitespace,
12880 and discard empty strings from the result.
12881 maxsplit: Py_ssize_t = -1
12882 Maximum number of splits to do.
12883 -1 (the default value) means no limit.
12884
12885Return a list of the words in the string, using sep as the delimiter string.
12886[clinic start generated code]*/
12887
12888static PyObject *
12889unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12890/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891{
INADA Naoki3ae20562017-01-16 20:41:20 +090012892 if (sep == Py_None)
12893 return split(self, NULL, maxsplit);
12894 if (PyUnicode_Check(sep))
12895 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012896
12897 PyErr_Format(PyExc_TypeError,
12898 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090012899 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901}
12902
Thomas Wouters477c8d52006-05-27 19:21:47 +000012903PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012904PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012905{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012906 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012907 int kind1, kind2;
12908 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012909 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012910
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012911 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012912 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012913
Victor Stinner14f8f022011-10-05 20:58:25 +020012914 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012915 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916 len1 = PyUnicode_GET_LENGTH(str_obj);
12917 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012918 if (kind1 < kind2 || len1 < len2) {
12919 _Py_INCREF_UNICODE_EMPTY();
12920 if (!unicode_empty)
12921 out = NULL;
12922 else {
12923 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12924 Py_DECREF(unicode_empty);
12925 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012926 return out;
12927 }
12928 buf1 = PyUnicode_DATA(str_obj);
12929 buf2 = PyUnicode_DATA(sep_obj);
12930 if (kind2 != kind1) {
12931 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12932 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012933 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012934 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012936 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012938 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12939 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12940 else
12941 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 break;
12943 case PyUnicode_2BYTE_KIND:
12944 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12945 break;
12946 case PyUnicode_4BYTE_KIND:
12947 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12948 break;
12949 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012950 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012952
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012953 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012955
12956 return out;
12957}
12958
12959
12960PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012961PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012962{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012963 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012964 int kind1, kind2;
12965 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012967
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012968 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012969 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012970
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012971 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 len1 = PyUnicode_GET_LENGTH(str_obj);
12974 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012975 if (kind1 < kind2 || len1 < len2) {
12976 _Py_INCREF_UNICODE_EMPTY();
12977 if (!unicode_empty)
12978 out = NULL;
12979 else {
12980 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12981 Py_DECREF(unicode_empty);
12982 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012983 return out;
12984 }
12985 buf1 = PyUnicode_DATA(str_obj);
12986 buf2 = PyUnicode_DATA(sep_obj);
12987 if (kind2 != kind1) {
12988 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12989 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012990 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012993 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012995 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12996 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12997 else
12998 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 break;
13000 case PyUnicode_2BYTE_KIND:
13001 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13002 break;
13003 case PyUnicode_4BYTE_KIND:
13004 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13005 break;
13006 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013007 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013009
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013010 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013012
13013 return out;
13014}
13015
INADA Naoki3ae20562017-01-16 20:41:20 +090013016/*[clinic input]
13017str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013018
INADA Naoki3ae20562017-01-16 20:41:20 +090013019 sep: object
13020 /
13021
13022Partition the string into three parts using the given separator.
13023
13024This will search for the separator in the string. If the separator is found,
13025returns a 3-tuple containing the part before the separator, the separator
13026itself, and the part after it.
13027
13028If the separator is not found, returns a 3-tuple containing the original string
13029and two empty strings.
13030[clinic start generated code]*/
13031
13032static PyObject *
13033unicode_partition(PyObject *self, PyObject *sep)
13034/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013035{
INADA Naoki3ae20562017-01-16 20:41:20 +090013036 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013037}
13038
INADA Naoki3ae20562017-01-16 20:41:20 +090013039/*[clinic input]
13040str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013041
INADA Naoki3ae20562017-01-16 20:41:20 +090013042Partition the string into three parts using the given separator.
13043
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013044This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013045the separator is found, returns a 3-tuple containing the part before the
13046separator, the separator itself, and the part after it.
13047
13048If the separator is not found, returns a 3-tuple containing two empty strings
13049and the original string.
13050[clinic start generated code]*/
13051
13052static PyObject *
13053unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013054/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013055{
INADA Naoki3ae20562017-01-16 20:41:20 +090013056 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013057}
13058
Alexander Belopolsky40018472011-02-26 01:02:56 +000013059PyObject *
13060PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013061{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013062 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013063 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013064
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013065 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013066}
13067
INADA Naoki3ae20562017-01-16 20:41:20 +090013068/*[clinic input]
13069str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013070
INADA Naoki3ae20562017-01-16 20:41:20 +090013071Return a list of the words in the string, using sep as the delimiter string.
13072
13073Splits are done starting at the end of the string and working to the front.
13074[clinic start generated code]*/
13075
13076static PyObject *
13077unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13078/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013079{
INADA Naoki3ae20562017-01-16 20:41:20 +090013080 if (sep == Py_None)
13081 return rsplit(self, NULL, maxsplit);
13082 if (PyUnicode_Check(sep))
13083 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013084
13085 PyErr_Format(PyExc_TypeError,
13086 "must be str or None, not %.100s",
INADA Naoki3ae20562017-01-16 20:41:20 +090013087 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013088 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013089}
13090
INADA Naoki3ae20562017-01-16 20:41:20 +090013091/*[clinic input]
13092str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013093
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013094 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013095
13096Return a list of the lines in the string, breaking at line boundaries.
13097
13098Line breaks are not included in the resulting list unless keepends is given and
13099true.
13100[clinic start generated code]*/
13101
13102static PyObject *
13103unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013104/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013106 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107}
13108
13109static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013110PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013112 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113}
13114
INADA Naoki3ae20562017-01-16 20:41:20 +090013115/*[clinic input]
13116str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117
INADA Naoki3ae20562017-01-16 20:41:20 +090013118Convert uppercase characters to lowercase and lowercase characters to uppercase.
13119[clinic start generated code]*/
13120
13121static PyObject *
13122unicode_swapcase_impl(PyObject *self)
13123/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013125 if (PyUnicode_READY(self) == -1)
13126 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013127 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128}
13129
Larry Hastings61272b72014-01-07 12:41:53 -080013130/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013131
Larry Hastings31826802013-10-19 00:09:25 -070013132@staticmethod
13133str.maketrans as unicode_maketrans
13134
13135 x: object
13136
13137 y: unicode=NULL
13138
13139 z: unicode=NULL
13140
13141 /
13142
13143Return a translation table usable for str.translate().
13144
13145If there is only one argument, it must be a dictionary mapping Unicode
13146ordinals (integers) or characters to Unicode ordinals, strings or None.
13147Character keys will be then converted to ordinals.
13148If there are two arguments, they must be strings of equal length, and
13149in the resulting dictionary, each character in x will be mapped to the
13150character at the same position in y. If there is a third argument, it
13151must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013152[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013153
Larry Hastings31826802013-10-19 00:09:25 -070013154static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013155unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013156/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013157{
Georg Brandlceee0772007-11-27 23:48:05 +000013158 PyObject *new = NULL, *key, *value;
13159 Py_ssize_t i = 0;
13160 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013161
Georg Brandlceee0772007-11-27 23:48:05 +000013162 new = PyDict_New();
13163 if (!new)
13164 return NULL;
13165 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013166 int x_kind, y_kind, z_kind;
13167 void *x_data, *y_data, *z_data;
13168
Georg Brandlceee0772007-11-27 23:48:05 +000013169 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013170 if (!PyUnicode_Check(x)) {
13171 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13172 "be a string if there is a second argument");
13173 goto err;
13174 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013175 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013176 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13177 "arguments must have equal length");
13178 goto err;
13179 }
13180 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013181 x_kind = PyUnicode_KIND(x);
13182 y_kind = PyUnicode_KIND(y);
13183 x_data = PyUnicode_DATA(x);
13184 y_data = PyUnicode_DATA(y);
13185 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13186 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013187 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013188 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013189 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013190 if (!value) {
13191 Py_DECREF(key);
13192 goto err;
13193 }
Georg Brandlceee0772007-11-27 23:48:05 +000013194 res = PyDict_SetItem(new, key, value);
13195 Py_DECREF(key);
13196 Py_DECREF(value);
13197 if (res < 0)
13198 goto err;
13199 }
13200 /* create entries for deleting chars in z */
13201 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013202 z_kind = PyUnicode_KIND(z);
13203 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013204 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013206 if (!key)
13207 goto err;
13208 res = PyDict_SetItem(new, key, Py_None);
13209 Py_DECREF(key);
13210 if (res < 0)
13211 goto err;
13212 }
13213 }
13214 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215 int kind;
13216 void *data;
13217
Georg Brandlceee0772007-11-27 23:48:05 +000013218 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013219 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013220 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13221 "to maketrans it must be a dict");
13222 goto err;
13223 }
13224 /* copy entries into the new dict, converting string keys to int keys */
13225 while (PyDict_Next(x, &i, &key, &value)) {
13226 if (PyUnicode_Check(key)) {
13227 /* convert string keys to integer keys */
13228 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013229 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013230 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13231 "table must be of length 1");
13232 goto err;
13233 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 kind = PyUnicode_KIND(key);
13235 data = PyUnicode_DATA(key);
13236 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013237 if (!newkey)
13238 goto err;
13239 res = PyDict_SetItem(new, newkey, value);
13240 Py_DECREF(newkey);
13241 if (res < 0)
13242 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013243 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013244 /* just keep integer keys */
13245 if (PyDict_SetItem(new, key, value) < 0)
13246 goto err;
13247 } else {
13248 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13249 "be strings or integers");
13250 goto err;
13251 }
13252 }
13253 }
13254 return new;
13255 err:
13256 Py_DECREF(new);
13257 return NULL;
13258}
13259
INADA Naoki3ae20562017-01-16 20:41:20 +090013260/*[clinic input]
13261str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262
INADA Naoki3ae20562017-01-16 20:41:20 +090013263 table: object
13264 Translation table, which must be a mapping of Unicode ordinals to
13265 Unicode ordinals, strings, or None.
13266 /
13267
13268Replace each character in the string using the given translation table.
13269
13270The table must implement lookup/indexing via __getitem__, for instance a
13271dictionary or list. If this operation raises LookupError, the character is
13272left untouched. Characters mapped to None are deleted.
13273[clinic start generated code]*/
13274
13275static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013276unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013277/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013279 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280}
13281
INADA Naoki3ae20562017-01-16 20:41:20 +090013282/*[clinic input]
13283str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284
INADA Naoki3ae20562017-01-16 20:41:20 +090013285Return a copy of the string converted to uppercase.
13286[clinic start generated code]*/
13287
13288static PyObject *
13289unicode_upper_impl(PyObject *self)
13290/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013291{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013292 if (PyUnicode_READY(self) == -1)
13293 return NULL;
13294 if (PyUnicode_IS_ASCII(self))
13295 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013296 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297}
13298
INADA Naoki3ae20562017-01-16 20:41:20 +090013299/*[clinic input]
13300str.zfill as unicode_zfill
13301
13302 width: Py_ssize_t
13303 /
13304
13305Pad a numeric string with zeros on the left, to fill a field of the given width.
13306
13307The string is never truncated.
13308[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309
13310static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013311unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013312/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013314 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013315 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316 int kind;
13317 void *data;
13318 Py_UCS4 chr;
13319
Benjamin Petersonbac79492012-01-14 13:34:47 -050013320 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013321 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013322
Victor Stinnerc4b49542011-12-11 22:44:26 +010013323 if (PyUnicode_GET_LENGTH(self) >= width)
13324 return unicode_result_unchanged(self);
13325
13326 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327
13328 u = pad(self, fill, 0, '0');
13329
Walter Dörwald068325e2002-04-15 13:36:47 +000013330 if (u == NULL)
13331 return NULL;
13332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013333 kind = PyUnicode_KIND(u);
13334 data = PyUnicode_DATA(u);
13335 chr = PyUnicode_READ(kind, data, fill);
13336
13337 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013338 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013339 PyUnicode_WRITE(kind, data, 0, chr);
13340 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013341 }
13342
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013343 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013344 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013346
#if 0
/* Compiled out: wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013355PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013356 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013357\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013358Return True if S starts with the specified prefix, False otherwise.\n\
13359With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013360With optional end, stop comparing S at that position.\n\
13361prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362
13363static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013364unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013365 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013366{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013367 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013368 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013369 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013370 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013371 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372
Jesus Ceaac451502011-04-20 17:09:23 +020013373 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013374 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013375 if (PyTuple_Check(subobj)) {
13376 Py_ssize_t i;
13377 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013378 substring = PyTuple_GET_ITEM(subobj, i);
13379 if (!PyUnicode_Check(substring)) {
13380 PyErr_Format(PyExc_TypeError,
13381 "tuple for startswith must only contain str, "
13382 "not %.100s",
13383 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013384 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013385 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013386 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013387 if (result == -1)
13388 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013389 if (result) {
13390 Py_RETURN_TRUE;
13391 }
13392 }
13393 /* nothing matched */
13394 Py_RETURN_FALSE;
13395 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013396 if (!PyUnicode_Check(subobj)) {
13397 PyErr_Format(PyExc_TypeError,
13398 "startswith first arg must be str or "
13399 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013400 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013401 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013402 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013403 if (result == -1)
13404 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013405 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406}
13407
13408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013409PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013410 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013411\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013412Return True if S ends with the specified suffix, False otherwise.\n\
13413With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013414With optional end, stop comparing S at that position.\n\
13415suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013416
13417static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013418unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013421 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013422 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013423 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013424 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013425 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426
Jesus Ceaac451502011-04-20 17:09:23 +020013427 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013428 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013429 if (PyTuple_Check(subobj)) {
13430 Py_ssize_t i;
13431 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013432 substring = PyTuple_GET_ITEM(subobj, i);
13433 if (!PyUnicode_Check(substring)) {
13434 PyErr_Format(PyExc_TypeError,
13435 "tuple for endswith must only contain str, "
13436 "not %.100s",
13437 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013438 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013439 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013440 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013441 if (result == -1)
13442 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013443 if (result) {
13444 Py_RETURN_TRUE;
13445 }
13446 }
13447 Py_RETURN_FALSE;
13448 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013449 if (!PyUnicode_Check(subobj)) {
13450 PyErr_Format(PyExc_TypeError,
13451 "endswith first arg must be str or "
13452 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013453 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013454 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013455 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013456 if (result == -1)
13457 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013458 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013459}
13460
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013461static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013462_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013463{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013464 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13465 writer->data = PyUnicode_DATA(writer->buffer);
13466
13467 if (!writer->readonly) {
13468 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013469 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013470 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013471 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013472 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13473 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13474 writer->kind = PyUnicode_WCHAR_KIND;
13475 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13476
Victor Stinner8f674cc2013-04-17 23:02:17 +020013477 /* Copy-on-write mode: set buffer size to 0 so
13478 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13479 * next write. */
13480 writer->size = 0;
13481 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013482}
13483
Victor Stinnerd3f08822012-05-29 12:57:52 +020013484void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013485_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013486{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013487 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013488
13489 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013490 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013491
13492 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13493 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13494 writer->kind = PyUnicode_WCHAR_KIND;
13495 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013496}
13497
Victor Stinnerd3f08822012-05-29 12:57:52 +020013498int
13499_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13500 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013501{
13502 Py_ssize_t newlen;
13503 PyObject *newbuffer;
13504
Victor Stinner2740e462016-09-06 16:58:36 -070013505 assert(maxchar <= MAX_UNICODE);
13506
Victor Stinnerca9381e2015-09-22 00:58:32 +020013507 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013508 assert((maxchar > writer->maxchar && length >= 0)
13509 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013510
Victor Stinner202fdca2012-05-07 12:47:02 +020013511 if (length > PY_SSIZE_T_MAX - writer->pos) {
13512 PyErr_NoMemory();
13513 return -1;
13514 }
13515 newlen = writer->pos + length;
13516
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013517 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013518
Victor Stinnerd3f08822012-05-29 12:57:52 +020013519 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013520 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013521 if (writer->overallocate
13522 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13523 /* overallocate to limit the number of realloc() */
13524 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013525 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013526 if (newlen < writer->min_length)
13527 newlen = writer->min_length;
13528
Victor Stinnerd3f08822012-05-29 12:57:52 +020013529 writer->buffer = PyUnicode_New(newlen, maxchar);
13530 if (writer->buffer == NULL)
13531 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013532 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013533 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013534 if (writer->overallocate
13535 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13536 /* overallocate to limit the number of realloc() */
13537 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013538 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013539 if (newlen < writer->min_length)
13540 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013541
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013542 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013543 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013544 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013545 newbuffer = PyUnicode_New(newlen, maxchar);
13546 if (newbuffer == NULL)
13547 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013548 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13549 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013550 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013551 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013552 }
13553 else {
13554 newbuffer = resize_compact(writer->buffer, newlen);
13555 if (newbuffer == NULL)
13556 return -1;
13557 }
13558 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013559 }
13560 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013561 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013562 newbuffer = PyUnicode_New(writer->size, maxchar);
13563 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013564 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013565 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13566 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013567 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013568 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013569 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013570 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013571
13572#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013573}
13574
Victor Stinnerca9381e2015-09-22 00:58:32 +020013575int
13576_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13577 enum PyUnicode_Kind kind)
13578{
13579 Py_UCS4 maxchar;
13580
13581 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13582 assert(writer->kind < kind);
13583
13584 switch (kind)
13585 {
13586 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13587 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13588 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13589 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013590 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013591 }
13592
13593 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13594}
13595
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013596static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013597_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013598{
Victor Stinner2740e462016-09-06 16:58:36 -070013599 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013600 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13601 return -1;
13602 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13603 writer->pos++;
13604 return 0;
13605}
13606
13607int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013608_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13609{
13610 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13611}
13612
13613int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013614_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13615{
13616 Py_UCS4 maxchar;
13617 Py_ssize_t len;
13618
13619 if (PyUnicode_READY(str) == -1)
13620 return -1;
13621 len = PyUnicode_GET_LENGTH(str);
13622 if (len == 0)
13623 return 0;
13624 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13625 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013626 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013627 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013628 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013629 Py_INCREF(str);
13630 writer->buffer = str;
13631 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013632 writer->pos += len;
13633 return 0;
13634 }
13635 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13636 return -1;
13637 }
13638 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13639 str, 0, len);
13640 writer->pos += len;
13641 return 0;
13642}
13643
Victor Stinnere215d962012-10-06 23:03:36 +020013644int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013645_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13646 Py_ssize_t start, Py_ssize_t end)
13647{
13648 Py_UCS4 maxchar;
13649 Py_ssize_t len;
13650
13651 if (PyUnicode_READY(str) == -1)
13652 return -1;
13653
13654 assert(0 <= start);
13655 assert(end <= PyUnicode_GET_LENGTH(str));
13656 assert(start <= end);
13657
13658 if (end == 0)
13659 return 0;
13660
13661 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13662 return _PyUnicodeWriter_WriteStr(writer, str);
13663
13664 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13665 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13666 else
13667 maxchar = writer->maxchar;
13668 len = end - start;
13669
13670 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13671 return -1;
13672
13673 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13674 str, start, len);
13675 writer->pos += len;
13676 return 0;
13677}
13678
13679int
Victor Stinner4a587072013-11-19 12:54:53 +010013680_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13681 const char *ascii, Py_ssize_t len)
13682{
13683 if (len == -1)
13684 len = strlen(ascii);
13685
13686 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13687
13688 if (writer->buffer == NULL && !writer->overallocate) {
13689 PyObject *str;
13690
13691 str = _PyUnicode_FromASCII(ascii, len);
13692 if (str == NULL)
13693 return -1;
13694
13695 writer->readonly = 1;
13696 writer->buffer = str;
13697 _PyUnicodeWriter_Update(writer);
13698 writer->pos += len;
13699 return 0;
13700 }
13701
13702 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13703 return -1;
13704
13705 switch (writer->kind)
13706 {
13707 case PyUnicode_1BYTE_KIND:
13708 {
13709 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13710 Py_UCS1 *data = writer->data;
13711
Christian Heimesf051e432016-09-13 20:22:02 +020013712 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013713 break;
13714 }
13715 case PyUnicode_2BYTE_KIND:
13716 {
13717 _PyUnicode_CONVERT_BYTES(
13718 Py_UCS1, Py_UCS2,
13719 ascii, ascii + len,
13720 (Py_UCS2 *)writer->data + writer->pos);
13721 break;
13722 }
13723 case PyUnicode_4BYTE_KIND:
13724 {
13725 _PyUnicode_CONVERT_BYTES(
13726 Py_UCS1, Py_UCS4,
13727 ascii, ascii + len,
13728 (Py_UCS4 *)writer->data + writer->pos);
13729 break;
13730 }
13731 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013732 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013733 }
13734
13735 writer->pos += len;
13736 return 0;
13737}
13738
13739int
13740_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13741 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013742{
13743 Py_UCS4 maxchar;
13744
13745 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13746 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13747 return -1;
13748 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13749 writer->pos += len;
13750 return 0;
13751}
13752
Victor Stinnerd3f08822012-05-29 12:57:52 +020013753PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013754_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013755{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013756 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013757
Victor Stinnerd3f08822012-05-29 12:57:52 +020013758 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013759 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013760 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013761 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013762
13763 str = writer->buffer;
13764 writer->buffer = NULL;
13765
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013766 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013767 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13768 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013769 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013770
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013771 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13772 PyObject *str2;
13773 str2 = resize_compact(str, writer->pos);
13774 if (str2 == NULL) {
13775 Py_DECREF(str);
13776 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013777 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013778 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013779 }
13780
Victor Stinner15a0bd32013-07-08 22:29:55 +020013781 assert(_PyUnicode_CheckConsistency(str, 1));
13782 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013783}
13784
Victor Stinnerd3f08822012-05-29 12:57:52 +020013785void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013786_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013787{
13788 Py_CLEAR(writer->buffer);
13789}
13790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013791#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013792
13793PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013794 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013795\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013796Return a formatted version of S, using substitutions from args and kwargs.\n\
13797The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013798
Eric Smith27bbca62010-11-04 17:06:58 +000013799PyDoc_STRVAR(format_map__doc__,
13800 "S.format_map(mapping) -> str\n\
13801\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013802Return a formatted version of S, using substitutions from mapping.\n\
13803The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013804
INADA Naoki3ae20562017-01-16 20:41:20 +090013805/*[clinic input]
13806str.__format__ as unicode___format__
13807
13808 format_spec: unicode
13809 /
13810
13811Return a formatted version of the string as described by format_spec.
13812[clinic start generated code]*/
13813
Eric Smith4a7d76d2008-05-30 18:10:19 +000013814static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013815unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013816/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013817{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013818 _PyUnicodeWriter writer;
13819 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013820
Victor Stinnerd3f08822012-05-29 12:57:52 +020013821 if (PyUnicode_READY(self) == -1)
13822 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013823 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013824 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13825 self, format_spec, 0,
13826 PyUnicode_GET_LENGTH(format_spec));
13827 if (ret == -1) {
13828 _PyUnicodeWriter_Dealloc(&writer);
13829 return NULL;
13830 }
13831 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013832}
13833
INADA Naoki3ae20562017-01-16 20:41:20 +090013834/*[clinic input]
13835str.__sizeof__ as unicode_sizeof
13836
13837Return the size of the string in memory, in bytes.
13838[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013839
13840static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013841unicode_sizeof_impl(PyObject *self)
13842/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013843{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013844 Py_ssize_t size;
13845
13846 /* If it's a compact object, account for base structure +
13847 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013848 if (PyUnicode_IS_COMPACT_ASCII(self))
13849 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13850 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013851 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013852 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013853 else {
13854 /* If it is a two-block object, account for base object, and
13855 for character block if present. */
13856 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013857 if (_PyUnicode_DATA_ANY(self))
13858 size += (PyUnicode_GET_LENGTH(self) + 1) *
13859 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013860 }
13861 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013862 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013863 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13864 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13865 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13866 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013867
13868 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013869}
13870
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013871static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013872unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013873{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013874 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013875 if (!copy)
13876 return NULL;
13877 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013878}
13879
Guido van Rossumd57fd912000-03-10 22:53:23 +000013880static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013881 UNICODE_ENCODE_METHODDEF
13882 UNICODE_REPLACE_METHODDEF
13883 UNICODE_SPLIT_METHODDEF
13884 UNICODE_RSPLIT_METHODDEF
13885 UNICODE_JOIN_METHODDEF
13886 UNICODE_CAPITALIZE_METHODDEF
13887 UNICODE_CASEFOLD_METHODDEF
13888 UNICODE_TITLE_METHODDEF
13889 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013890 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013891 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013892 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013893 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013894 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013895 UNICODE_LJUST_METHODDEF
13896 UNICODE_LOWER_METHODDEF
13897 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013898 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13899 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013900 UNICODE_RJUST_METHODDEF
13901 UNICODE_RSTRIP_METHODDEF
13902 UNICODE_RPARTITION_METHODDEF
13903 UNICODE_SPLITLINES_METHODDEF
13904 UNICODE_STRIP_METHODDEF
13905 UNICODE_SWAPCASE_METHODDEF
13906 UNICODE_TRANSLATE_METHODDEF
13907 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013908 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13909 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013910 UNICODE_ISLOWER_METHODDEF
13911 UNICODE_ISUPPER_METHODDEF
13912 UNICODE_ISTITLE_METHODDEF
13913 UNICODE_ISSPACE_METHODDEF
13914 UNICODE_ISDECIMAL_METHODDEF
13915 UNICODE_ISDIGIT_METHODDEF
13916 UNICODE_ISNUMERIC_METHODDEF
13917 UNICODE_ISALPHA_METHODDEF
13918 UNICODE_ISALNUM_METHODDEF
13919 UNICODE_ISIDENTIFIER_METHODDEF
13920 UNICODE_ISPRINTABLE_METHODDEF
13921 UNICODE_ZFILL_METHODDEF
Eric Smith9cd1e092007-08-31 18:39:38 +000013922 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013923 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013924 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013925 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013926 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013927#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013928 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013929 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013930#endif
13931
Benjamin Peterson14339b62009-01-31 16:36:08 +000013932 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013933 {NULL, NULL}
13934};
13935
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013936static PyObject *
13937unicode_mod(PyObject *v, PyObject *w)
13938{
Brian Curtindfc80e32011-08-10 20:28:54 -050013939 if (!PyUnicode_Check(v))
13940 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013941 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013942}
13943
13944static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013945 0, /*nb_add*/
13946 0, /*nb_subtract*/
13947 0, /*nb_multiply*/
13948 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013949};
13950
Guido van Rossumd57fd912000-03-10 22:53:23 +000013951static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013952 (lenfunc) unicode_length, /* sq_length */
13953 PyUnicode_Concat, /* sq_concat */
13954 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13955 (ssizeargfunc) unicode_getitem, /* sq_item */
13956 0, /* sq_slice */
13957 0, /* sq_ass_item */
13958 0, /* sq_ass_slice */
13959 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013960};
13961
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013962static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013963unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013965 if (PyUnicode_READY(self) == -1)
13966 return NULL;
13967
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013968 if (PyIndex_Check(item)) {
13969 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013970 if (i == -1 && PyErr_Occurred())
13971 return NULL;
13972 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013973 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013974 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013975 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013976 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013977 PyObject *result;
13978 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013979 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013980 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013981
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013982 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013983 return NULL;
13984 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013985 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13986 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013987
13988 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013989 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013990 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013991 slicelength == PyUnicode_GET_LENGTH(self)) {
13992 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013993 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013994 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013995 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013996 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013997 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013998 src_kind = PyUnicode_KIND(self);
13999 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014000 if (!PyUnicode_IS_ASCII(self)) {
14001 kind_limit = kind_maxchar_limit(src_kind);
14002 max_char = 0;
14003 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14004 ch = PyUnicode_READ(src_kind, src_data, cur);
14005 if (ch > max_char) {
14006 max_char = ch;
14007 if (max_char >= kind_limit)
14008 break;
14009 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014010 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014011 }
Victor Stinner55c99112011-10-13 01:17:06 +020014012 else
14013 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014014 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014015 if (result == NULL)
14016 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014017 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014018 dest_data = PyUnicode_DATA(result);
14019
14020 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014021 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14022 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014023 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014024 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014025 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014026 } else {
14027 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14028 return NULL;
14029 }
14030}
14031
14032static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014033 (lenfunc)unicode_length, /* mp_length */
14034 (binaryfunc)unicode_subscript, /* mp_subscript */
14035 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014036};
14037
Guido van Rossumd57fd912000-03-10 22:53:23 +000014038
Guido van Rossumd57fd912000-03-10 22:53:23 +000014039/* Helpers for PyUnicode_Format() */
14040
Victor Stinnera47082312012-10-04 02:19:54 +020014041struct unicode_formatter_t {
14042 PyObject *args;
14043 int args_owned;
14044 Py_ssize_t arglen, argidx;
14045 PyObject *dict;
14046
14047 enum PyUnicode_Kind fmtkind;
14048 Py_ssize_t fmtcnt, fmtpos;
14049 void *fmtdata;
14050 PyObject *fmtstr;
14051
14052 _PyUnicodeWriter writer;
14053};
14054
14055struct unicode_format_arg_t {
14056 Py_UCS4 ch;
14057 int flags;
14058 Py_ssize_t width;
14059 int prec;
14060 int sign;
14061};
14062
Guido van Rossumd57fd912000-03-10 22:53:23 +000014063static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014064unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014065{
Victor Stinnera47082312012-10-04 02:19:54 +020014066 Py_ssize_t argidx = ctx->argidx;
14067
14068 if (argidx < ctx->arglen) {
14069 ctx->argidx++;
14070 if (ctx->arglen < 0)
14071 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014072 else
Victor Stinnera47082312012-10-04 02:19:54 +020014073 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014074 }
14075 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014076 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014077 return NULL;
14078}
14079
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014080/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014081
Victor Stinnera47082312012-10-04 02:19:54 +020014082/* Format a float into the writer if the writer is not NULL, or into *p_output
14083 otherwise.
14084
14085 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014086static int
Victor Stinnera47082312012-10-04 02:19:54 +020014087formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14088 PyObject **p_output,
14089 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014090{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014091 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014092 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014093 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014094 int prec;
14095 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014096
Guido van Rossumd57fd912000-03-10 22:53:23 +000014097 x = PyFloat_AsDouble(v);
14098 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014099 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014100
Victor Stinnera47082312012-10-04 02:19:54 +020014101 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014102 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014103 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014104
Victor Stinnera47082312012-10-04 02:19:54 +020014105 if (arg->flags & F_ALT)
14106 dtoa_flags = Py_DTSF_ALT;
14107 else
14108 dtoa_flags = 0;
14109 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014110 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014111 return -1;
14112 len = strlen(p);
14113 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014114 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014115 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014116 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014117 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014118 }
14119 else
14120 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014121 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014122 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014123}
14124
Victor Stinnerd0880d52012-04-27 23:40:13 +020014125/* formatlong() emulates the format codes d, u, o, x and X, and
14126 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14127 * Python's regular ints.
14128 * Return value: a new PyUnicodeObject*, or NULL if error.
14129 * The output string is of the form
14130 * "-"? ("0x" | "0X")? digit+
14131 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14132 * set in flags. The case of hex digits will be correct,
14133 * There will be at least prec digits, zero-filled on the left if
14134 * necessary to get that many.
14135 * val object to be converted
14136 * flags bitmask of format flags; only F_ALT is looked at
14137 * prec minimum number of digits; 0-fill on left if needed
14138 * type a character in [duoxX]; u acts the same as d
14139 *
14140 * CAUTION: o, x and X conversions on regular ints can never
14141 * produce a '-' sign, but can for Python's unbounded ints.
14142 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014143PyObject *
14144_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014145{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014146 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014147 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014148 Py_ssize_t i;
14149 int sign; /* 1 if '-', else 0 */
14150 int len; /* number of characters */
14151 Py_ssize_t llen;
14152 int numdigits; /* len == numnondigits + numdigits */
14153 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014154
Victor Stinnerd0880d52012-04-27 23:40:13 +020014155 /* Avoid exceeding SSIZE_T_MAX */
14156 if (prec > INT_MAX-3) {
14157 PyErr_SetString(PyExc_OverflowError,
14158 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014159 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014160 }
14161
14162 assert(PyLong_Check(val));
14163
14164 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014165 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014166 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014167 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014168 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014169 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014170 /* int and int subclasses should print numerically when a numeric */
14171 /* format code is used (see issue18780) */
14172 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014173 break;
14174 case 'o':
14175 numnondigits = 2;
14176 result = PyNumber_ToBase(val, 8);
14177 break;
14178 case 'x':
14179 case 'X':
14180 numnondigits = 2;
14181 result = PyNumber_ToBase(val, 16);
14182 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014183 }
14184 if (!result)
14185 return NULL;
14186
14187 assert(unicode_modifiable(result));
14188 assert(PyUnicode_IS_READY(result));
14189 assert(PyUnicode_IS_ASCII(result));
14190
14191 /* To modify the string in-place, there can only be one reference. */
14192 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014193 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014194 PyErr_BadInternalCall();
14195 return NULL;
14196 }
14197 buf = PyUnicode_DATA(result);
14198 llen = PyUnicode_GET_LENGTH(result);
14199 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014200 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014201 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014202 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014203 return NULL;
14204 }
14205 len = (int)llen;
14206 sign = buf[0] == '-';
14207 numnondigits += sign;
14208 numdigits = len - numnondigits;
14209 assert(numdigits > 0);
14210
14211 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014212 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014213 (type == 'o' || type == 'x' || type == 'X'))) {
14214 assert(buf[sign] == '0');
14215 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14216 buf[sign+1] == 'o');
14217 numnondigits -= 2;
14218 buf += 2;
14219 len -= 2;
14220 if (sign)
14221 buf[0] = '-';
14222 assert(len == numnondigits + numdigits);
14223 assert(numdigits > 0);
14224 }
14225
14226 /* Fill with leading zeroes to meet minimum width. */
14227 if (prec > numdigits) {
14228 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14229 numnondigits + prec);
14230 char *b1;
14231 if (!r1) {
14232 Py_DECREF(result);
14233 return NULL;
14234 }
14235 b1 = PyBytes_AS_STRING(r1);
14236 for (i = 0; i < numnondigits; ++i)
14237 *b1++ = *buf++;
14238 for (i = 0; i < prec - numdigits; i++)
14239 *b1++ = '0';
14240 for (i = 0; i < numdigits; i++)
14241 *b1++ = *buf++;
14242 *b1 = '\0';
14243 Py_DECREF(result);
14244 result = r1;
14245 buf = PyBytes_AS_STRING(result);
14246 len = numnondigits + prec;
14247 }
14248
14249 /* Fix up case for hex conversions. */
14250 if (type == 'X') {
14251 /* Need to convert all lower case letters to upper case.
14252 and need to convert 0x to 0X (and -0x to -0X). */
14253 for (i = 0; i < len; i++)
14254 if (buf[i] >= 'a' && buf[i] <= 'x')
14255 buf[i] -= 'a'-'A';
14256 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014257 if (!PyUnicode_Check(result)
14258 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014259 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014260 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014261 Py_DECREF(result);
14262 result = unicode;
14263 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014264 else if (len != PyUnicode_GET_LENGTH(result)) {
14265 if (PyUnicode_Resize(&result, len) < 0)
14266 Py_CLEAR(result);
14267 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014268 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014269}
14270
Ethan Furmandf3ed242014-01-05 06:50:30 -080014271/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014272 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014273 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014274 * -1 and raise an exception on error */
14275static int
Victor Stinnera47082312012-10-04 02:19:54 +020014276mainformatlong(PyObject *v,
14277 struct unicode_format_arg_t *arg,
14278 PyObject **p_output,
14279 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014280{
14281 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014282 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014283
14284 if (!PyNumber_Check(v))
14285 goto wrongtype;
14286
Ethan Furman9ab74802014-03-21 06:38:46 -070014287 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014288 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014289 if (type == 'o' || type == 'x' || type == 'X') {
14290 iobj = PyNumber_Index(v);
14291 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014292 if (PyErr_ExceptionMatches(PyExc_TypeError))
14293 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014294 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014295 }
14296 }
14297 else {
14298 iobj = PyNumber_Long(v);
14299 if (iobj == NULL ) {
14300 if (PyErr_ExceptionMatches(PyExc_TypeError))
14301 goto wrongtype;
14302 return -1;
14303 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014304 }
14305 assert(PyLong_Check(iobj));
14306 }
14307 else {
14308 iobj = v;
14309 Py_INCREF(iobj);
14310 }
14311
14312 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014313 && arg->width == -1 && arg->prec == -1
14314 && !(arg->flags & (F_SIGN | F_BLANK))
14315 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014316 {
14317 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014318 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014319 int base;
14320
Victor Stinnera47082312012-10-04 02:19:54 +020014321 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014322 {
14323 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014324 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014325 case 'd':
14326 case 'i':
14327 case 'u':
14328 base = 10;
14329 break;
14330 case 'o':
14331 base = 8;
14332 break;
14333 case 'x':
14334 case 'X':
14335 base = 16;
14336 break;
14337 }
14338
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014339 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14340 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014341 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014342 }
14343 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014344 return 1;
14345 }
14346
Ethan Furmanb95b5612015-01-23 20:05:18 -080014347 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014348 Py_DECREF(iobj);
14349 if (res == NULL)
14350 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014351 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014352 return 0;
14353
14354wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014355 switch(type)
14356 {
14357 case 'o':
14358 case 'x':
14359 case 'X':
14360 PyErr_Format(PyExc_TypeError,
14361 "%%%c format: an integer is required, "
14362 "not %.200s",
14363 type, Py_TYPE(v)->tp_name);
14364 break;
14365 default:
14366 PyErr_Format(PyExc_TypeError,
14367 "%%%c format: a number is required, "
14368 "not %.200s",
14369 type, Py_TYPE(v)->tp_name);
14370 break;
14371 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014372 return -1;
14373}
14374
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014375static Py_UCS4
14376formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014377{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014378 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014379 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014380 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014381 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014382 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014383 goto onError;
14384 }
14385 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014386 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014387 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014388 /* make sure number is a type of integer */
14389 if (!PyLong_Check(v)) {
14390 iobj = PyNumber_Index(v);
14391 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014392 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014393 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014394 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014395 Py_DECREF(iobj);
14396 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014397 else {
14398 x = PyLong_AsLong(v);
14399 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014400 if (x == -1 && PyErr_Occurred())
14401 goto onError;
14402
Victor Stinner8faf8212011-12-08 22:14:11 +010014403 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014404 PyErr_SetString(PyExc_OverflowError,
14405 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014406 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014407 }
14408
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014409 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014410 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014411
Benjamin Peterson29060642009-01-31 22:14:21 +000014412 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014413 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014414 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014415 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014416}
14417
Victor Stinnera47082312012-10-04 02:19:54 +020014418/* Parse options of an argument: flags, width, precision.
14419 Handle also "%(name)" syntax.
14420
14421 Return 0 if the argument has been formatted into arg->str.
14422 Return 1 if the argument has been written into ctx->writer,
14423 Raise an exception and return -1 on error. */
14424static int
14425unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14426 struct unicode_format_arg_t *arg)
14427{
14428#define FORMAT_READ(ctx) \
14429 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14430
14431 PyObject *v;
14432
Victor Stinnera47082312012-10-04 02:19:54 +020014433 if (arg->ch == '(') {
14434 /* Get argument value from a dictionary. Example: "%(name)s". */
14435 Py_ssize_t keystart;
14436 Py_ssize_t keylen;
14437 PyObject *key;
14438 int pcount = 1;
14439
14440 if (ctx->dict == NULL) {
14441 PyErr_SetString(PyExc_TypeError,
14442 "format requires a mapping");
14443 return -1;
14444 }
14445 ++ctx->fmtpos;
14446 --ctx->fmtcnt;
14447 keystart = ctx->fmtpos;
14448 /* Skip over balanced parentheses */
14449 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14450 arg->ch = FORMAT_READ(ctx);
14451 if (arg->ch == ')')
14452 --pcount;
14453 else if (arg->ch == '(')
14454 ++pcount;
14455 ctx->fmtpos++;
14456 }
14457 keylen = ctx->fmtpos - keystart - 1;
14458 if (ctx->fmtcnt < 0 || pcount > 0) {
14459 PyErr_SetString(PyExc_ValueError,
14460 "incomplete format key");
14461 return -1;
14462 }
14463 key = PyUnicode_Substring(ctx->fmtstr,
14464 keystart, keystart + keylen);
14465 if (key == NULL)
14466 return -1;
14467 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014468 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014469 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014470 }
14471 ctx->args = PyObject_GetItem(ctx->dict, key);
14472 Py_DECREF(key);
14473 if (ctx->args == NULL)
14474 return -1;
14475 ctx->args_owned = 1;
14476 ctx->arglen = -1;
14477 ctx->argidx = -2;
14478 }
14479
14480 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014481 while (--ctx->fmtcnt >= 0) {
14482 arg->ch = FORMAT_READ(ctx);
14483 ctx->fmtpos++;
14484 switch (arg->ch) {
14485 case '-': arg->flags |= F_LJUST; continue;
14486 case '+': arg->flags |= F_SIGN; continue;
14487 case ' ': arg->flags |= F_BLANK; continue;
14488 case '#': arg->flags |= F_ALT; continue;
14489 case '0': arg->flags |= F_ZERO; continue;
14490 }
14491 break;
14492 }
14493
14494 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014495 if (arg->ch == '*') {
14496 v = unicode_format_getnextarg(ctx);
14497 if (v == NULL)
14498 return -1;
14499 if (!PyLong_Check(v)) {
14500 PyErr_SetString(PyExc_TypeError,
14501 "* wants int");
14502 return -1;
14503 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014504 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014505 if (arg->width == -1 && PyErr_Occurred())
14506 return -1;
14507 if (arg->width < 0) {
14508 arg->flags |= F_LJUST;
14509 arg->width = -arg->width;
14510 }
14511 if (--ctx->fmtcnt >= 0) {
14512 arg->ch = FORMAT_READ(ctx);
14513 ctx->fmtpos++;
14514 }
14515 }
14516 else if (arg->ch >= '0' && arg->ch <= '9') {
14517 arg->width = arg->ch - '0';
14518 while (--ctx->fmtcnt >= 0) {
14519 arg->ch = FORMAT_READ(ctx);
14520 ctx->fmtpos++;
14521 if (arg->ch < '0' || arg->ch > '9')
14522 break;
14523 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14524 mixing signed and unsigned comparison. Since arg->ch is between
14525 '0' and '9', casting to int is safe. */
14526 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14527 PyErr_SetString(PyExc_ValueError,
14528 "width too big");
14529 return -1;
14530 }
14531 arg->width = arg->width*10 + (arg->ch - '0');
14532 }
14533 }
14534
14535 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014536 if (arg->ch == '.') {
14537 arg->prec = 0;
14538 if (--ctx->fmtcnt >= 0) {
14539 arg->ch = FORMAT_READ(ctx);
14540 ctx->fmtpos++;
14541 }
14542 if (arg->ch == '*') {
14543 v = unicode_format_getnextarg(ctx);
14544 if (v == NULL)
14545 return -1;
14546 if (!PyLong_Check(v)) {
14547 PyErr_SetString(PyExc_TypeError,
14548 "* wants int");
14549 return -1;
14550 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014551 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014552 if (arg->prec == -1 && PyErr_Occurred())
14553 return -1;
14554 if (arg->prec < 0)
14555 arg->prec = 0;
14556 if (--ctx->fmtcnt >= 0) {
14557 arg->ch = FORMAT_READ(ctx);
14558 ctx->fmtpos++;
14559 }
14560 }
14561 else if (arg->ch >= '0' && arg->ch <= '9') {
14562 arg->prec = arg->ch - '0';
14563 while (--ctx->fmtcnt >= 0) {
14564 arg->ch = FORMAT_READ(ctx);
14565 ctx->fmtpos++;
14566 if (arg->ch < '0' || arg->ch > '9')
14567 break;
14568 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14569 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014570 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014571 return -1;
14572 }
14573 arg->prec = arg->prec*10 + (arg->ch - '0');
14574 }
14575 }
14576 }
14577
14578 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14579 if (ctx->fmtcnt >= 0) {
14580 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14581 if (--ctx->fmtcnt >= 0) {
14582 arg->ch = FORMAT_READ(ctx);
14583 ctx->fmtpos++;
14584 }
14585 }
14586 }
14587 if (ctx->fmtcnt < 0) {
14588 PyErr_SetString(PyExc_ValueError,
14589 "incomplete format");
14590 return -1;
14591 }
14592 return 0;
14593
14594#undef FORMAT_READ
14595}
14596
14597/* Format one argument. Supported conversion specifiers:
14598
14599 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014600 - "i", "d", "u": int or float
14601 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014602 - "e", "E", "f", "F", "g", "G": float
14603 - "c": int or str (1 character)
14604
Victor Stinner8dbd4212012-12-04 09:30:24 +010014605 When possible, the output is written directly into the Unicode writer
14606 (ctx->writer). A string is created when padding is required.
14607
Victor Stinnera47082312012-10-04 02:19:54 +020014608 Return 0 if the argument has been formatted into *p_str,
14609 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014610 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014611static int
14612unicode_format_arg_format(struct unicode_formatter_t *ctx,
14613 struct unicode_format_arg_t *arg,
14614 PyObject **p_str)
14615{
14616 PyObject *v;
14617 _PyUnicodeWriter *writer = &ctx->writer;
14618
14619 if (ctx->fmtcnt == 0)
14620 ctx->writer.overallocate = 0;
14621
Victor Stinnera47082312012-10-04 02:19:54 +020014622 v = unicode_format_getnextarg(ctx);
14623 if (v == NULL)
14624 return -1;
14625
Victor Stinnera47082312012-10-04 02:19:54 +020014626
14627 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014628 case 's':
14629 case 'r':
14630 case 'a':
14631 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14632 /* Fast path */
14633 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14634 return -1;
14635 return 1;
14636 }
14637
14638 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14639 *p_str = v;
14640 Py_INCREF(*p_str);
14641 }
14642 else {
14643 if (arg->ch == 's')
14644 *p_str = PyObject_Str(v);
14645 else if (arg->ch == 'r')
14646 *p_str = PyObject_Repr(v);
14647 else
14648 *p_str = PyObject_ASCII(v);
14649 }
14650 break;
14651
14652 case 'i':
14653 case 'd':
14654 case 'u':
14655 case 'o':
14656 case 'x':
14657 case 'X':
14658 {
14659 int ret = mainformatlong(v, arg, p_str, writer);
14660 if (ret != 0)
14661 return ret;
14662 arg->sign = 1;
14663 break;
14664 }
14665
14666 case 'e':
14667 case 'E':
14668 case 'f':
14669 case 'F':
14670 case 'g':
14671 case 'G':
14672 if (arg->width == -1 && arg->prec == -1
14673 && !(arg->flags & (F_SIGN | F_BLANK)))
14674 {
14675 /* Fast path */
14676 if (formatfloat(v, arg, NULL, writer) == -1)
14677 return -1;
14678 return 1;
14679 }
14680
14681 arg->sign = 1;
14682 if (formatfloat(v, arg, p_str, NULL) == -1)
14683 return -1;
14684 break;
14685
14686 case 'c':
14687 {
14688 Py_UCS4 ch = formatchar(v);
14689 if (ch == (Py_UCS4) -1)
14690 return -1;
14691 if (arg->width == -1 && arg->prec == -1) {
14692 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014693 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014694 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014695 return 1;
14696 }
14697 *p_str = PyUnicode_FromOrdinal(ch);
14698 break;
14699 }
14700
14701 default:
14702 PyErr_Format(PyExc_ValueError,
14703 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014704 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014705 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14706 (int)arg->ch,
14707 ctx->fmtpos - 1);
14708 return -1;
14709 }
14710 if (*p_str == NULL)
14711 return -1;
14712 assert (PyUnicode_Check(*p_str));
14713 return 0;
14714}
14715
14716static int
14717unicode_format_arg_output(struct unicode_formatter_t *ctx,
14718 struct unicode_format_arg_t *arg,
14719 PyObject *str)
14720{
14721 Py_ssize_t len;
14722 enum PyUnicode_Kind kind;
14723 void *pbuf;
14724 Py_ssize_t pindex;
14725 Py_UCS4 signchar;
14726 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014727 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014728 Py_ssize_t sublen;
14729 _PyUnicodeWriter *writer = &ctx->writer;
14730 Py_UCS4 fill;
14731
14732 fill = ' ';
14733 if (arg->sign && arg->flags & F_ZERO)
14734 fill = '0';
14735
14736 if (PyUnicode_READY(str) == -1)
14737 return -1;
14738
14739 len = PyUnicode_GET_LENGTH(str);
14740 if ((arg->width == -1 || arg->width <= len)
14741 && (arg->prec == -1 || arg->prec >= len)
14742 && !(arg->flags & (F_SIGN | F_BLANK)))
14743 {
14744 /* Fast path */
14745 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14746 return -1;
14747 return 0;
14748 }
14749
14750 /* Truncate the string for "s", "r" and "a" formats
14751 if the precision is set */
14752 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14753 if (arg->prec >= 0 && len > arg->prec)
14754 len = arg->prec;
14755 }
14756
14757 /* Adjust sign and width */
14758 kind = PyUnicode_KIND(str);
14759 pbuf = PyUnicode_DATA(str);
14760 pindex = 0;
14761 signchar = '\0';
14762 if (arg->sign) {
14763 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14764 if (ch == '-' || ch == '+') {
14765 signchar = ch;
14766 len--;
14767 pindex++;
14768 }
14769 else if (arg->flags & F_SIGN)
14770 signchar = '+';
14771 else if (arg->flags & F_BLANK)
14772 signchar = ' ';
14773 else
14774 arg->sign = 0;
14775 }
14776 if (arg->width < len)
14777 arg->width = len;
14778
14779 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014780 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014781 if (!(arg->flags & F_LJUST)) {
14782 if (arg->sign) {
14783 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014784 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014785 }
14786 else {
14787 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014788 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014789 }
14790 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014791 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14792 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014793 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014794 }
14795
Victor Stinnera47082312012-10-04 02:19:54 +020014796 buflen = arg->width;
14797 if (arg->sign && len == arg->width)
14798 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014799 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014800 return -1;
14801
14802 /* Write the sign if needed */
14803 if (arg->sign) {
14804 if (fill != ' ') {
14805 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14806 writer->pos += 1;
14807 }
14808 if (arg->width > len)
14809 arg->width--;
14810 }
14811
14812 /* Write the numeric prefix for "x", "X" and "o" formats
14813 if the alternate form is used.
14814 For example, write "0x" for the "%#x" format. */
14815 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14816 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14817 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14818 if (fill != ' ') {
14819 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14820 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14821 writer->pos += 2;
14822 pindex += 2;
14823 }
14824 arg->width -= 2;
14825 if (arg->width < 0)
14826 arg->width = 0;
14827 len -= 2;
14828 }
14829
14830 /* Pad left with the fill character if needed */
14831 if (arg->width > len && !(arg->flags & F_LJUST)) {
14832 sublen = arg->width - len;
14833 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14834 writer->pos += sublen;
14835 arg->width = len;
14836 }
14837
14838 /* If padding with spaces: write sign if needed and/or numeric prefix if
14839 the alternate form is used */
14840 if (fill == ' ') {
14841 if (arg->sign) {
14842 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14843 writer->pos += 1;
14844 }
14845 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14846 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14847 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14848 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14849 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14850 writer->pos += 2;
14851 pindex += 2;
14852 }
14853 }
14854
14855 /* Write characters */
14856 if (len) {
14857 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14858 str, pindex, len);
14859 writer->pos += len;
14860 }
14861
14862 /* Pad right with the fill character if needed */
14863 if (arg->width > len) {
14864 sublen = arg->width - len;
14865 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14866 writer->pos += sublen;
14867 }
14868 return 0;
14869}
14870
14871/* Helper of PyUnicode_Format(): format one arg.
14872 Return 0 on success, raise an exception and return -1 on error. */
14873static int
14874unicode_format_arg(struct unicode_formatter_t *ctx)
14875{
14876 struct unicode_format_arg_t arg;
14877 PyObject *str;
14878 int ret;
14879
Victor Stinner8dbd4212012-12-04 09:30:24 +010014880 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014881 if (arg.ch == '%') {
14882 ctx->fmtpos++;
14883 ctx->fmtcnt--;
14884 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14885 return -1;
14886 return 0;
14887 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014888 arg.flags = 0;
14889 arg.width = -1;
14890 arg.prec = -1;
14891 arg.sign = 0;
14892 str = NULL;
14893
Victor Stinnera47082312012-10-04 02:19:54 +020014894 ret = unicode_format_arg_parse(ctx, &arg);
14895 if (ret == -1)
14896 return -1;
14897
14898 ret = unicode_format_arg_format(ctx, &arg, &str);
14899 if (ret == -1)
14900 return -1;
14901
14902 if (ret != 1) {
14903 ret = unicode_format_arg_output(ctx, &arg, str);
14904 Py_DECREF(str);
14905 if (ret == -1)
14906 return -1;
14907 }
14908
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014909 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014910 PyErr_SetString(PyExc_TypeError,
14911 "not all arguments converted during string formatting");
14912 return -1;
14913 }
14914 return 0;
14915}
14916
Alexander Belopolsky40018472011-02-26 01:02:56 +000014917PyObject *
14918PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014919{
Victor Stinnera47082312012-10-04 02:19:54 +020014920 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014921
Guido van Rossumd57fd912000-03-10 22:53:23 +000014922 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014923 PyErr_BadInternalCall();
14924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014925 }
Victor Stinnera47082312012-10-04 02:19:54 +020014926
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014927 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014928 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014929
14930 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014931 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14932 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14933 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14934 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014935
Victor Stinner8f674cc2013-04-17 23:02:17 +020014936 _PyUnicodeWriter_Init(&ctx.writer);
14937 ctx.writer.min_length = ctx.fmtcnt + 100;
14938 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014939
Guido van Rossumd57fd912000-03-10 22:53:23 +000014940 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014941 ctx.arglen = PyTuple_Size(args);
14942 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014943 }
14944 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014945 ctx.arglen = -1;
14946 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014947 }
Victor Stinnera47082312012-10-04 02:19:54 +020014948 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014949 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014950 ctx.dict = args;
14951 else
14952 ctx.dict = NULL;
14953 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014954
Victor Stinnera47082312012-10-04 02:19:54 +020014955 while (--ctx.fmtcnt >= 0) {
14956 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014957 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014958
14959 nonfmtpos = ctx.fmtpos++;
14960 while (ctx.fmtcnt >= 0 &&
14961 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14962 ctx.fmtpos++;
14963 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014964 }
Victor Stinnera47082312012-10-04 02:19:54 +020014965 if (ctx.fmtcnt < 0) {
14966 ctx.fmtpos--;
14967 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014968 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014969
Victor Stinnercfc4c132013-04-03 01:48:39 +020014970 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14971 nonfmtpos, ctx.fmtpos) < 0)
14972 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014973 }
14974 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014975 ctx.fmtpos++;
14976 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014977 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014978 }
14979 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014980
Victor Stinnera47082312012-10-04 02:19:54 +020014981 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014982 PyErr_SetString(PyExc_TypeError,
14983 "not all arguments converted during string formatting");
14984 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014985 }
14986
Victor Stinnera47082312012-10-04 02:19:54 +020014987 if (ctx.args_owned) {
14988 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014989 }
Victor Stinnera47082312012-10-04 02:19:54 +020014990 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014991
Benjamin Peterson29060642009-01-31 22:14:21 +000014992 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014993 _PyUnicodeWriter_Dealloc(&ctx.writer);
14994 if (ctx.args_owned) {
14995 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014996 }
14997 return NULL;
14998}
14999
Jeremy Hylton938ace62002-07-17 16:30:39 +000015000static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015001unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15002
Tim Peters6d6c1a32001-08-02 04:15:00 +000015003static PyObject *
15004unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15005{
Benjamin Peterson29060642009-01-31 22:14:21 +000015006 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015007 static char *kwlist[] = {"object", "encoding", "errors", 0};
15008 char *encoding = NULL;
15009 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015010
Benjamin Peterson14339b62009-01-31 16:36:08 +000015011 if (type != &PyUnicode_Type)
15012 return unicode_subtype_new(type, args, kwds);
15013 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015014 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015015 return NULL;
15016 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015017 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015018 if (encoding == NULL && errors == NULL)
15019 return PyObject_Str(x);
15020 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015021 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015022}
15023
Guido van Rossume023fe02001-08-30 03:12:59 +000015024static PyObject *
15025unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15026{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015027 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015028 Py_ssize_t length, char_size;
15029 int share_wstr, share_utf8;
15030 unsigned int kind;
15031 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015032
Benjamin Peterson14339b62009-01-31 16:36:08 +000015033 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015034
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015035 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015036 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015037 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015038 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015039 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015040 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015041 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015042 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015043
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015044 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015045 if (self == NULL) {
15046 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015047 return NULL;
15048 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015049 kind = PyUnicode_KIND(unicode);
15050 length = PyUnicode_GET_LENGTH(unicode);
15051
15052 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015053#ifdef Py_DEBUG
15054 _PyUnicode_HASH(self) = -1;
15055#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015056 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015057#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015058 _PyUnicode_STATE(self).interned = 0;
15059 _PyUnicode_STATE(self).kind = kind;
15060 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015061 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015062 _PyUnicode_STATE(self).ready = 1;
15063 _PyUnicode_WSTR(self) = NULL;
15064 _PyUnicode_UTF8_LENGTH(self) = 0;
15065 _PyUnicode_UTF8(self) = NULL;
15066 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015067 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015068
15069 share_utf8 = 0;
15070 share_wstr = 0;
15071 if (kind == PyUnicode_1BYTE_KIND) {
15072 char_size = 1;
15073 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15074 share_utf8 = 1;
15075 }
15076 else if (kind == PyUnicode_2BYTE_KIND) {
15077 char_size = 2;
15078 if (sizeof(wchar_t) == 2)
15079 share_wstr = 1;
15080 }
15081 else {
15082 assert(kind == PyUnicode_4BYTE_KIND);
15083 char_size = 4;
15084 if (sizeof(wchar_t) == 4)
15085 share_wstr = 1;
15086 }
15087
15088 /* Ensure we won't overflow the length. */
15089 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15090 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015091 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015092 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015093 data = PyObject_MALLOC((length + 1) * char_size);
15094 if (data == NULL) {
15095 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015096 goto onError;
15097 }
15098
Victor Stinnerc3c74152011-10-02 20:39:55 +020015099 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015100 if (share_utf8) {
15101 _PyUnicode_UTF8_LENGTH(self) = length;
15102 _PyUnicode_UTF8(self) = data;
15103 }
15104 if (share_wstr) {
15105 _PyUnicode_WSTR_LENGTH(self) = length;
15106 _PyUnicode_WSTR(self) = (wchar_t *)data;
15107 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015108
Christian Heimesf051e432016-09-13 20:22:02 +020015109 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015110 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015111 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015112#ifdef Py_DEBUG
15113 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15114#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015115 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015116 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015117
15118onError:
15119 Py_DECREF(unicode);
15120 Py_DECREF(self);
15121 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015122}
15123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015124PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015125"str(object='') -> str\n\
15126str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015127\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015128Create a new string object from the given object. If encoding or\n\
15129errors is specified, then the object must expose a data buffer\n\
15130that will be decoded using the given encoding and error handler.\n\
15131Otherwise, returns the result of object.__str__() (if defined)\n\
15132or repr(object).\n\
15133encoding defaults to sys.getdefaultencoding().\n\
15134errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015135
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015136static PyObject *unicode_iter(PyObject *seq);
15137
Guido van Rossumd57fd912000-03-10 22:53:23 +000015138PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015139 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015140 "str", /* tp_name */
15141 sizeof(PyUnicodeObject), /* tp_size */
15142 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015143 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015144 (destructor)unicode_dealloc, /* tp_dealloc */
15145 0, /* tp_print */
15146 0, /* tp_getattr */
15147 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015148 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015149 unicode_repr, /* tp_repr */
15150 &unicode_as_number, /* tp_as_number */
15151 &unicode_as_sequence, /* tp_as_sequence */
15152 &unicode_as_mapping, /* tp_as_mapping */
15153 (hashfunc) unicode_hash, /* tp_hash*/
15154 0, /* tp_call*/
15155 (reprfunc) unicode_str, /* tp_str */
15156 PyObject_GenericGetAttr, /* tp_getattro */
15157 0, /* tp_setattro */
15158 0, /* tp_as_buffer */
15159 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000015160 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015161 unicode_doc, /* tp_doc */
15162 0, /* tp_traverse */
15163 0, /* tp_clear */
15164 PyUnicode_RichCompare, /* tp_richcompare */
15165 0, /* tp_weaklistoffset */
15166 unicode_iter, /* tp_iter */
15167 0, /* tp_iternext */
15168 unicode_methods, /* tp_methods */
15169 0, /* tp_members */
15170 0, /* tp_getset */
15171 &PyBaseObject_Type, /* tp_base */
15172 0, /* tp_dict */
15173 0, /* tp_descr_get */
15174 0, /* tp_descr_set */
15175 0, /* tp_dictoffset */
15176 0, /* tp_init */
15177 0, /* tp_alloc */
15178 unicode_new, /* tp_new */
15179 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015180};
15181
15182/* Initialize the Unicode implementation */
15183
Victor Stinner3a50e702011-10-18 21:21:00 +020015184int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015185{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015186 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015187 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015188 0x000A, /* LINE FEED */
15189 0x000D, /* CARRIAGE RETURN */
15190 0x001C, /* FILE SEPARATOR */
15191 0x001D, /* GROUP SEPARATOR */
15192 0x001E, /* RECORD SEPARATOR */
15193 0x0085, /* NEXT LINE */
15194 0x2028, /* LINE SEPARATOR */
15195 0x2029, /* PARAGRAPH SEPARATOR */
15196 };
15197
Fred Drakee4315f52000-05-09 19:53:39 +000015198 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015199 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015200 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015201 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015202 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015203
Guido van Rossumcacfc072002-05-24 19:01:59 +000015204 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015205 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015206
15207 /* initialize the linebreak bloom filter */
15208 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015209 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015210 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015211
Christian Heimes26532f72013-07-20 14:57:16 +020015212 if (PyType_Ready(&EncodingMapType) < 0)
15213 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015214
Benjamin Petersonc4311282012-10-30 23:21:10 -040015215 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15216 Py_FatalError("Can't initialize field name iterator type");
15217
15218 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15219 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015220
Victor Stinner3a50e702011-10-18 21:21:00 +020015221 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015222}
15223
15224/* Finalize the Unicode implementation */
15225
/* This implementation does nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15231
Guido van Rossumd57fd912000-03-10 22:53:23 +000015232void
Thomas Wouters78890102000-07-22 19:25:51 +000015233_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015234{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015235 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015236
Serhiy Storchaka05997252013-01-26 12:14:02 +020015237 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015238
Serhiy Storchaka05997252013-01-26 12:14:02 +020015239 for (i = 0; i < 256; i++)
15240 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015241 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015242 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015243}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015244
Walter Dörwald16807132007-05-25 13:52:07 +000015245void
15246PyUnicode_InternInPlace(PyObject **p)
15247{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015248 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015249 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015250#ifdef Py_DEBUG
15251 assert(s != NULL);
15252 assert(_PyUnicode_CHECK(s));
15253#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015254 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015255 return;
15256#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015257 /* If it's a subclass, we don't really know what putting
15258 it in the interned dict might do. */
15259 if (!PyUnicode_CheckExact(s))
15260 return;
15261 if (PyUnicode_CHECK_INTERNED(s))
15262 return;
15263 if (interned == NULL) {
15264 interned = PyDict_New();
15265 if (interned == NULL) {
15266 PyErr_Clear(); /* Don't leave an exception */
15267 return;
15268 }
15269 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015270 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015271 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015272 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015273 if (t == NULL) {
15274 PyErr_Clear();
15275 return;
15276 }
15277 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015278 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015279 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015280 return;
15281 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015282 /* The two references in interned are not counted by refcnt.
15283 The deallocator will take care of this */
15284 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015285 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015286}
15287
15288void
15289PyUnicode_InternImmortal(PyObject **p)
15290{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015291 PyUnicode_InternInPlace(p);
15292 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015293 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015294 Py_INCREF(*p);
15295 }
Walter Dörwald16807132007-05-25 13:52:07 +000015296}
15297
15298PyObject *
15299PyUnicode_InternFromString(const char *cp)
15300{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015301 PyObject *s = PyUnicode_FromString(cp);
15302 if (s == NULL)
15303 return NULL;
15304 PyUnicode_InternInPlace(&s);
15305 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015306}
15307
Alexander Belopolsky40018472011-02-26 01:02:56 +000015308void
15309_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015310{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015311 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015312 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015313 Py_ssize_t i, n;
15314 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015315
Benjamin Peterson14339b62009-01-31 16:36:08 +000015316 if (interned == NULL || !PyDict_Check(interned))
15317 return;
15318 keys = PyDict_Keys(interned);
15319 if (keys == NULL || !PyList_Check(keys)) {
15320 PyErr_Clear();
15321 return;
15322 }
Walter Dörwald16807132007-05-25 13:52:07 +000015323
Benjamin Peterson14339b62009-01-31 16:36:08 +000015324 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15325 detector, interned unicode strings are not forcibly deallocated;
15326 rather, we give them their stolen references back, and then clear
15327 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015328
Benjamin Peterson14339b62009-01-31 16:36:08 +000015329 n = PyList_GET_SIZE(keys);
15330 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015331 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015332 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015333 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015334 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015335 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015336 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015337 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015338 case SSTATE_NOT_INTERNED:
15339 /* XXX Shouldn't happen */
15340 break;
15341 case SSTATE_INTERNED_IMMORTAL:
15342 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015343 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015344 break;
15345 case SSTATE_INTERNED_MORTAL:
15346 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015347 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015348 break;
15349 default:
15350 Py_FatalError("Inconsistent interned string state.");
15351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015352 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015353 }
15354 fprintf(stderr, "total size of all interned strings: "
15355 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15356 "mortal/immortal\n", mortal_size, immortal_size);
15357 Py_DECREF(keys);
15358 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015359 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015360}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015361
15362
15363/********************* Unicode Iterator **************************/
15364
15365typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015366 PyObject_HEAD
15367 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015368 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015369} unicodeiterobject;
15370
15371static void
15372unicodeiter_dealloc(unicodeiterobject *it)
15373{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015374 _PyObject_GC_UNTRACK(it);
15375 Py_XDECREF(it->it_seq);
15376 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015377}
15378
15379static int
15380unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15381{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015382 Py_VISIT(it->it_seq);
15383 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015384}
15385
15386static PyObject *
15387unicodeiter_next(unicodeiterobject *it)
15388{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015389 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015390
Benjamin Peterson14339b62009-01-31 16:36:08 +000015391 assert(it != NULL);
15392 seq = it->it_seq;
15393 if (seq == NULL)
15394 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015395 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015397 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15398 int kind = PyUnicode_KIND(seq);
15399 void *data = PyUnicode_DATA(seq);
15400 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15401 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015402 if (item != NULL)
15403 ++it->it_index;
15404 return item;
15405 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015406
Benjamin Peterson14339b62009-01-31 16:36:08 +000015407 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015408 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015409 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015410}
15411
15412static PyObject *
15413unicodeiter_len(unicodeiterobject *it)
15414{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015415 Py_ssize_t len = 0;
15416 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015417 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015418 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015419}
15420
15421PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15422
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015423static PyObject *
15424unicodeiter_reduce(unicodeiterobject *it)
15425{
15426 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015427 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015428 it->it_seq, it->it_index);
15429 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015430 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015431 if (u == NULL)
15432 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015433 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015434 }
15435}
15436
15437PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15438
15439static PyObject *
15440unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15441{
15442 Py_ssize_t index = PyLong_AsSsize_t(state);
15443 if (index == -1 && PyErr_Occurred())
15444 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015445 if (it->it_seq != NULL) {
15446 if (index < 0)
15447 index = 0;
15448 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15449 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15450 it->it_index = index;
15451 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015452 Py_RETURN_NONE;
15453}
15454
15455PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15456
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015457static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015458 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015459 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015460 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15461 reduce_doc},
15462 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15463 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015464 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015465};
15466
15467PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015468 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15469 "str_iterator", /* tp_name */
15470 sizeof(unicodeiterobject), /* tp_basicsize */
15471 0, /* tp_itemsize */
15472 /* methods */
15473 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15474 0, /* tp_print */
15475 0, /* tp_getattr */
15476 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015477 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015478 0, /* tp_repr */
15479 0, /* tp_as_number */
15480 0, /* tp_as_sequence */
15481 0, /* tp_as_mapping */
15482 0, /* tp_hash */
15483 0, /* tp_call */
15484 0, /* tp_str */
15485 PyObject_GenericGetAttr, /* tp_getattro */
15486 0, /* tp_setattro */
15487 0, /* tp_as_buffer */
15488 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15489 0, /* tp_doc */
15490 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15491 0, /* tp_clear */
15492 0, /* tp_richcompare */
15493 0, /* tp_weaklistoffset */
15494 PyObject_SelfIter, /* tp_iter */
15495 (iternextfunc)unicodeiter_next, /* tp_iternext */
15496 unicodeiter_methods, /* tp_methods */
15497 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015498};
15499
15500static PyObject *
15501unicode_iter(PyObject *seq)
15502{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015503 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015504
Benjamin Peterson14339b62009-01-31 16:36:08 +000015505 if (!PyUnicode_Check(seq)) {
15506 PyErr_BadInternalCall();
15507 return NULL;
15508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015509 if (PyUnicode_READY(seq) == -1)
15510 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015511 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15512 if (it == NULL)
15513 return NULL;
15514 it->it_index = 0;
15515 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015516 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015517 _PyObject_GC_TRACK(it);
15518 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015519}
15520
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015521
15522size_t
15523Py_UNICODE_strlen(const Py_UNICODE *u)
15524{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015525 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015526}
15527
15528Py_UNICODE*
15529Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15530{
15531 Py_UNICODE *u = s1;
15532 while ((*u++ = *s2++));
15533 return s1;
15534}
15535
15536Py_UNICODE*
15537Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15538{
15539 Py_UNICODE *u = s1;
15540 while ((*u++ = *s2++))
15541 if (n-- == 0)
15542 break;
15543 return s1;
15544}
15545
15546Py_UNICODE*
15547Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15548{
15549 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015550 u1 += wcslen(u1);
15551 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015552 return s1;
15553}
15554
15555int
15556Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15557{
15558 while (*s1 && *s2 && *s1 == *s2)
15559 s1++, s2++;
15560 if (*s1 && *s2)
15561 return (*s1 < *s2) ? -1 : +1;
15562 if (*s1)
15563 return 1;
15564 if (*s2)
15565 return -1;
15566 return 0;
15567}
15568
15569int
15570Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15571{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015572 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015573 for (; n != 0; n--) {
15574 u1 = *s1;
15575 u2 = *s2;
15576 if (u1 != u2)
15577 return (u1 < u2) ? -1 : +1;
15578 if (u1 == '\0')
15579 return 0;
15580 s1++;
15581 s2++;
15582 }
15583 return 0;
15584}
15585
15586Py_UNICODE*
15587Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15588{
15589 const Py_UNICODE *p;
15590 for (p = s; *p; p++)
15591 if (*p == c)
15592 return (Py_UNICODE*)p;
15593 return NULL;
15594}
15595
15596Py_UNICODE*
15597Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15598{
15599 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015600 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015601 while (p != s) {
15602 p--;
15603 if (*p == c)
15604 return (Py_UNICODE*)p;
15605 }
15606 return NULL;
15607}
Victor Stinner331ea922010-08-10 16:37:20 +000015608
Victor Stinner71133ff2010-09-01 23:43:53 +000015609Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015610PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015611{
Victor Stinner577db2c2011-10-11 22:12:48 +020015612 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015613 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015615 if (!PyUnicode_Check(unicode)) {
15616 PyErr_BadArgument();
15617 return NULL;
15618 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015619 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015620 if (u == NULL)
15621 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015622 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015623 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015624 PyErr_NoMemory();
15625 return NULL;
15626 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015627 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015628 size *= sizeof(Py_UNICODE);
15629 copy = PyMem_Malloc(size);
15630 if (copy == NULL) {
15631 PyErr_NoMemory();
15632 return NULL;
15633 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015634 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015635 return copy;
15636}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015637
Georg Brandl66c221e2010-10-14 07:04:07 +000015638/* A _string module, to export formatter_parser and formatter_field_name_split
15639 to the string.Formatter class implemented in Python. */
15640
15641static PyMethodDef _string_methods[] = {
15642 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15643 METH_O, PyDoc_STR("split the argument as a field name")},
15644 {"formatter_parser", (PyCFunction) formatter_parser,
15645 METH_O, PyDoc_STR("parse the argument as a format string")},
15646 {NULL, NULL}
15647};
15648
15649static struct PyModuleDef _string_module = {
15650 PyModuleDef_HEAD_INIT,
15651 "_string",
15652 PyDoc_STR("string helper module"),
15653 0,
15654 _string_methods,
15655 NULL,
15656 NULL,
15657 NULL,
15658 NULL
15659};
15660
15661PyMODINIT_FUNC
15662PyInit__string(void)
15663{
15664 return PyModule_Create(&_string_module);
15665}
15666
15667
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015668#ifdef __cplusplus
15669}
15670#endif